I have tried the pipeline approach, but I am not sure I am doing it right based on the tutorials, since most of them pick out portions of response.body with selectors.
However, I can already parse everything I need in a separate script, even though the data is jumbled up with other variables, so all I really need Scrapy to do is dump response.body into an .xml or .txt file.
This works when there is a single URL, but the moment I introduce multiple URLs the output keeps getting overwritten and I only end up with the final parse. I suspect there is a simpler workaround that avoids pipelines.py/items.py entirely, since I only need response.body (see the sketch after the code below for roughly what I mean).
Forgive the indentation; it was hard to copy the code over cleanly.
```
# Imports (random, datetime, BeautifulSoup and the item class are all used below)
import random
from datetime import datetime

import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup

from tickets.items import TicketsItem

# df is loaded elsewhere and contains the URLOUT column of event URLs
linkarr = df['URLOUT'].tolist()
today = datetime.today().strftime('%Y%m%d')


class MpvticketSpider(scrapy.Spider):
    name = 'mpvticket'
    handle_httpstatus_list = [403, 502, 503, 404]

    def start_requests(self):
        for url in linkarr:
            eventid = str(url).strip().split("pid=")[1].split("&")[0]
            # filenames are built here but are never passed on to parse_api
            filename_xml = str(eventid) + "_" + str(today) + ".xml"
            filename_txt = str(eventid) + "_" + str(today) + ".txt"
            print("\n FIRST URL BEING RUN: ", url)
            pid = str(url).split("pid=")[1].split('&')[0]
            username = 'XXXX'
            password = 'XXXX'
            port = 22225
            session_id = random.random()
            super_proxy_url = ('http://%s-country-us-session-%s:%[email protected]:%d' %
                               (username, session_id, password, port))
            headers = {
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'accept-language': 'en-US,en;q=0.9',
                'cache-control': 'max-age=0',
                'referer': 'https://www.mlb.com/',
                'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"Windows"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            }
            yield scrapy.Request(url, callback=self.parse_api, meta={'proxy': super_proxy_url}, headers=headers)

    def parse_api(self, response):
        item = TicketsItem()
        raw_data = response.body
        soup = BeautifulSoup(raw_data, 'lxml')
        item['data'] = soup
        yield item
        # Commented portion was the original method, but it overwrote my Output.xml
        # try:
        #     with open(filename_xml, "w") as f:
        #         f.write(str(soup))
        # except:
        #     with open(filename_txt, 'w') as f:
        #         f.write(str(soup))


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(MpvticketSpider)
    process.start()
```
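To make the "simpler workaround" idea concrete, this is roughly the shape of what I am after: no pipelines or items, just a unique filename per URL that travels with the request so each body can be written straight to disk. This is an untested sketch; the cb_kwargs usage is my guess at how to pass the filename along, and the proxy/header setup is left out:
```
# Untested sketch of the no-pipeline idea: derive a unique filename per URL,
# pass it to the callback via cb_kwargs, and dump the raw body straight to disk.
import scrapy
from datetime import datetime

today = datetime.today().strftime('%Y%m%d')


class DumpBodySpider(scrapy.Spider):
    name = 'dumpbody'

    def start_requests(self):
        for url in linkarr:  # same URL list as above
            eventid = str(url).strip().split("pid=")[1].split("&")[0]
            filename_xml = str(eventid) + "_" + str(today) + ".xml"
            yield scrapy.Request(url, callback=self.parse_api,
                                 cb_kwargs={'filename_xml': filename_xml})

    def parse_api(self, response, filename_xml):
        # every response carries its own filename, so nothing gets overwritten
        with open(filename_xml, "wb") as f:
            f.write(response.body)
```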
UPDATE:
```
#Imports
import random
from datetime import datetime
from sys import path

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.loader import itemloaders

path.append(r'D:\Projects\tickets')
from tickets.items import TicketsItem

today = datetime.today().strftime('%Y%m%d')


class MpvticketSpider(scrapy.Spider):
    name = 'mpvticket'
    handle_httpstatus_list = [403, 502, 503, 404]

    def start_requests(self):
        # for url in linkarr:
        url = 'https://mpv.tickets.com/api/pvodc/v1/events/navmap/availability/?pid=9016692&agency=MLB_MPV&orgId=10&supportsVoucherRedemption=true'
        print("\n FIRST URL BEING RUN: ", url)
        username = 'XXXX'
        password = 'XXXX'
        port = 22225
        session_id = random.random()
        super_proxy_url = ('http://%s-country-us-session-%s:%[email protected]:%d' %
                           (username, session_id, password, port))
        headers = {
            # headers same as above
        }
        yield scrapy.Request(url, callback=self.parse_api, meta={'proxy': super_proxy_url})

    def parse_api(self, response):
        url = response.url
        eventid = str(url).strip().split("pid=")[1].split("&")[0]
        filename_xml = str(eventid) + "_" + str(today) + ".xml"
        data = response.xpath("//body")
        item = TicketsItem()
        item['data'] = data
        item['filename_xml'] = filename_xml
        yield item
```
pipelines.py
```
from re import I
from itemadapter import ItemAdapter
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline


class TicketsPipeline:
    def process_item(self, item, spider):
        for filename in item['filename_xml']:
            with open(filename, "w") as fd:
                fd.write(item['data'])
    raise DropItem
```
items.py
```
import scrapy
from scrapy.loader import itemloaders
from itemloaders.processors import MapCompose


class TicketsItem(scrapy.Item):
    filename_xml = scrapy.Field()
    data = scrapy.Field()
```
Not sure what is wrong, but I am now getting the following error:
```
Traceback (most recent call last):
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 206, in crawl
return self._crawl(crawler, *args, **kwargs)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 210, in _crawl
d = crawler.crawl(*args, **kwargs)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1905, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1815, in _cancellableInlineCallbacks
_inlineCallbacks(None, gen, status)
--- <exception caught here> ---
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1660, in _inlineCallbacks
result = current_context.run(gen.send, result)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 102, in crawl
self.engine = self._create_engine()
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 116, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\engine.py", line 84, in __init__
self.scraper = Scraper(crawler)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\scraper.py", line 75, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\middleware.py", line 59, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\middleware.py", line 40, in from_settings
mwcls = load_object(clspath)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\misc.py", line 61, in load_object
mod = import_module(module)
File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "D:\Projects\tickets\tickets\pipelines.py", line 15, in <module>
class TicketsPipeline:
File "D:\Projects\tickets\tickets\pipelines.py", line 22, in TicketsPipeline
raise DropItem
scrapy.exceptions.DropItem:
```
You should move the logic for deciding the filename/output path into your parse method and add it as a field on the yielded item. Then, in your item pipeline, you can save the body to that output path and drop the item, since there is no need for further processing at that point.
So change your parse method to something like this:
```
def parse_api(self, response):
    url = response.url
    eventid = str(url).strip().split("pid=")[1].split("&")[0]
    filename_xml = str(eventid) + "_" + str(today) + ".xml"
    data = response.xpath("//body").get()
    item = TicketsItem()
    item['data'] = data
    item['filename_xml'] = filename_xml
    yield item
```
You would need to change your item to something like this:
```
class TicketsItem(scrapy.Item):
    filename_xml = scrapy.Field()
    data = scrapy.Field()
```
Then your items pipeline could look like this:
```
from scrapy.exceptions import DropItem


class SpiderPipeline:
    def process_item(self, item, spider):
        with open(item["filename_xml"], "w") as fd:
            fd.write(item["data"])
        raise DropItem
```