Tags: python, web-scraping, scrapy, scrapy-pipeline, scrapy-item

How to Loop URLs from a List in Scrapy and Output Only the Response Body to an XML/TXT File


I have tried the Pipeline method, but I am not sure I am doing it right based on the tutorials, since most of them pick out specific portions of response.body with selectors.

I can, however, parse the output in a separate script, which gives me all the data I need even though it is jumbled up with other variables. So I only need Scrapy to dump response.body into either an .xml or .txt file.

I can do this for a single URL, but the moment I introduce multiple URLs the final parse overwrites the earlier output. I believe there may be a simpler workaround that avoids pipelines.py/items.py entirely, given that I only need response.body.

Forgive the indentation; it was hard to copy it over.

```
# Imports needed by this snippet (the DataFrame df holding the URL list is built earlier, not shown)
import random
from datetime import datetime

import scrapy
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess

from tickets.items import TicketsItem

linkarr = df['URLOUT'].tolist()
today = datetime.today().strftime('%Y%m%d')

class MpvticketSpider(scrapy.Spider):

    name = 'mpvticket'
    # start_requests() below builds the requests, so start_urls is not needed
    handle_httpstatus_list = [403, 502, 503, 404]

    def start_requests(self):

        for url in linkarr:

            eventid = str(url).strip().split("pid=")[1].split("&")[0]
            filename_xml = str(eventid) + "_" + str(today) + ".xml"
            filename_txt = str(eventid) + "_" + str(today) + ".txt"
            
            print("\n FIRST  URL BEING RUN: ",url)
            pid = str(url).split("pid=")[1].split('&')[0]
            username = 'XXXX'
            password = 'XXXX'
            port = 22225
            session_id = random.random()
            # rotating super-proxy URL (the proxy hostname is redacted in the original post)
            super_proxy_url = ('http://%s-country-us-session-%s:%s@<superproxy-host>:%d' %
                (username, session_id, password, port))

            headers = {
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'accept-language': 'en-US,en;q=0.9',
                'cache-control': 'max-age=0',
                'referer': 'https://www.mlb.com/',
                'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"Windows"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            }
            yield scrapy.Request(url, callback=self.parse_api,meta={'proxy': super_proxy_url},headers=headers)

    def parse_api(self, response):
        item = TicketsItem()
        raw_data = response.body
        soup = BeautifulSoup(raw_data, 'lxml')
        item['data'] = soup
        yield item
        # Commented portion was the original method, but it overwrote my Output.xml
        # try:
        #     with open(filename_xml, "w") as f:
        #         f.write(str(soup))
        # except:
        #     with open(filename_txt, 'w') as f:
        #         f.write(str(soup))

if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(MpvticketSpider)
    process.start()

```

UPDATE:
```
# Imports
import random
from datetime import datetime

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.loader import itemloaders
from sys import path
path.append(r'D:\Projects\tickets')
from tickets.items import TicketsItem

today = datetime.today().strftime('%Y%m%d')

class MpvticketSpider(scrapy.Spider):

    name = 'mpvticket'
    handle_httpstatus_list = [403, 502, 503, 404]


    
    def start_requests(self):

        #for url in linkarr:
        url = 'https://mpv.tickets.com/api/pvodc/v1/events/navmap/availability/?pid=9016692&agency=MLB_MPV&orgId=10&supportsVoucherRedemption=true'
        


        print("\n FIRST  URL BEING RUN: ",url)
        username = 'XXXX'
        password = 'XXXX'
        port = 22225
        session_id = random.random()
        # rotating super-proxy URL (the proxy hostname is redacted in the original post)
        super_proxy_url = ('http://%s-country-us-session-%s:%s@<superproxy-host>:%d' %
            (username, session_id, password, port))

        headers = {
                   #headers 
        }


        yield scrapy.Request(url, callback=self.parse_api, meta={'proxy': super_proxy_url}, headers=headers)

    def parse_api(self,response):
        url = response.url
        eventid = str(url).strip().split("pid=")[1].split("&")[0] 
        filename_xml = str(eventid) + "_" + str(today) + ".xml"
        data = response.xpath("//body")
        item = TicketsItem()
        item['data'] = data
        item['filename_xml'] = filename_xml
        yield item
```
pipelines.py
```
from re import I
from itemadapter import ItemAdapter
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline


class TicketsPipeline:

    def process_item(self, item, spider):
        for filename in item['filename_xml']:
            with open(filename, "w") as fd:
                fd.write(item['data'])
    
    raise DropItem
```
items.py
```
import scrapy
from scrapy.loader import itemloaders
from itemloaders.processors import MapCompose




class TicketsItem(scrapy.Item):
    filename_xml = scrapy.Field()
    data = scrapy.Field()
```
Not sure what is wrong, but I am now getting the following error:

```
Traceback (most recent call last):
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 206, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 210, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1905, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1815, in _cancellableInlineCallbacks
    _inlineCallbacks(None, gen, status)
--- <exception caught here> ---
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1660, in _inlineCallbacks
    result = current_context.run(gen.send, result)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 102, in crawl
    self.engine = self._create_engine()
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 116, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\engine.py", line 84, in __init__
    self.scraper = Scraper(crawler)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\scraper.py", line 75, in __init__
    self.itemproc = itemproc_cls.from_crawler(crawler)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\middleware.py", line 59, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\middleware.py", line 40, in from_settings
    mwcls = load_object(clspath)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\misc.py", line 61, in load_object
    mod = import_module(module)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import

  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load

  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked

  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked

  File "<frozen importlib._bootstrap_external>", line 883, in exec_module

  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed

  File "D:\Projects\tickets\tickets\pipelines.py", line 15, in <module>
    class TicketsPipeline:
  File "D:\Projects\tickets\tickets\pipelines.py", line 22, in TicketsPipeline
    raise DropItem
scrapy.exceptions.DropItem:
```

Solution

  • You should move the logic that decides the filename/output path into your parse method, and then add it as a field on the yielded item. Right now the filenames are built inside start_requests, so by the time the callbacks fire they only see the values from the last loop iteration, which is why every response ended up overwriting the same file. In your item pipeline you can then save the body to that path and drop the item, since there is no need for further processing at that point. (The traceback above is a separate problem: raise DropItem sits at class level in pipelines.py, so it runs as soon as the module is imported; it belongs inside process_item, as shown below.)

    So change your parse method to something like this:

    def parse_api(self,response):
        url = response.url
        eventid = str(url).strip().split("pid=")[1].split("&")[0] 
        filename_xml = str(eventid) + "_" + str(today) + ".xml"
        data = response.xpath("//body").get()
        item = TicketsItem()
        item['data'] = data
        item['filename_xml'] = filename_xml
        yield item
    

    You would need to change your item to something like this:

    class TicketsItem(scrapy.Item):
        filename_xml = scrapy.Field()
        data = scrapy.Field()
    

    Then your items pipeline could look like this:

    from scrapy.exceptions import DropItem
    
    class SpiderPipeline:
    
        def process_item(self, item, spider):
            with open(item["filename_xml"], "w") as fd:
                fd.write(item["data"])
            raise DropItem
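
    For the pipeline to run, it also has to be enabled in the project settings (yours already is, judging by the traceback). Assuming the project layout from your paths, the entry in settings.py would look roughly like this, with whichever pipeline class name you keep:

    # settings.py
    ITEM_PIPELINES = {
        "tickets.pipelines.SpiderPipeline": 300,   # or TicketsPipeline, if you keep that name
    }

    One last note: response.xpath("//body").get() only returns the <body> element of an HTML page. If you want the complete raw payload from that endpoint, item["data"] = response.text keeps everything the server sent.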