I'm trying to scrape all comments, their authors, and the post times under a Steam Workshop mod using scrapy and scrapy-playwright, but I'm only getting the first comment per page, and on top of that it's very slow. I'm still very much a beginner in Python and in web scraping in general, so the slowness isn't a big deal yet, but how can I get it to scrape every comment on each page before moving on to the next one?
Here is my code:
import scrapy
import asyncio
from scrapy_playwright.page import PageMethod
from ..items import WorkshopCommentsItem


class A81tilesSpider(scrapy.Spider):
    name = '81tiles'
    start_urls = ['https://steamcommunity.com/sharedfiles/filedetails/?id=2881031511']

    def start_requests(self):
        yield scrapy.Request(
            url=self.start_urls[0],
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    PageMethod("wait_for_selector", "div.commentthread_comment_container", timeout=60000)
                ],
                "page_number": 1
            }
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        page_number = response.meta["page_number"]
        print(f"Processing page {page_number}")
        while True:
            # Deal with cookie and press reject
            try:
                cookie_popup = await page.query_selector('#rejectAllButton')
                await page.wait_for_selector('#rejectAllButton', state='visible', timeout=40000)
                await cookie_popup.click()
                await page.wait_for_selector(
                    "#commentthread_PublishedFile_Public_76561198262198841_2881031511_fpagebtn_next", state='visible',
                    timeout=40000)
            except:
                pass
            # Wait for comments to load
            await page.wait_for_selector('div.commentthread_comment_container')
            # Scrape comments on current page
            all_div_comments = await page.query_selector_all('div.commentthread_comment_container')
            print(f"Number of comments evaluated on page {page_number}: {len(all_div_comments)}")
            for comments in all_div_comments:
                steam_item = WorkshopCommentsItem()
                steam_item['post_time'] = await (
                    await comments.query_selector('.commentthread_comment_timestamp')).inner_text()
                steam_item['post_content'] = await (
                    await comments.query_selector('.commentthread_comment_text')).inner_text()
                steam_item['post_author'] = await (await comments.query_selector('bdi')).inner_text()
                yield steam_item
            await asyncio.sleep(10)
            # Check if there are more pages
            next_page_available = await page.evaluate(
                "(function(){return Boolean(document.querySelector('#commentthread_PublishedFile_Public_76561198262198841_2881031511_fpagebtn_next:not([disabled])'));})()")
            if next_page_available:
                # Click on the next page button
                print("Clicking on Next button")
                await page.click("#commentthread_PublishedFile_Public_76561198262198841_2881031511_fpagebtn_next")
                # Increment the page number
                page_number += 1
                print(f"Processing page {page_number}")
                # Update the meta and wait for the new page to load
                response.meta["page_number"] = page_number
                try:
                    await page.wait_for_selector("div.commentthread_comment_container", timeout=60000)
                except:
                    pass
            else:
                break
        await page.close()
I tried evaluating the comments before scraping, but that also just shows 1 comment. I slowed everything down with the timeouts because the browser would close before scraping finished, leading to an asyncio error.
I recommend using scrapy to parse the page and extract its contents instead of using the playwright API. As for why you only get one comment per page: div.commentthread_comment_container matches the single wrapper div around the whole thread, so query_selector_all returns just one element; each individual comment is a div.commentthread_comment inside it. And the reason it is so slow is that you are telling it to go that slow.
For example, in your start_requests the request is set with a timeout of 60000 ms, which is a full minute, and then you wait up to 40 seconds for the cookie popup on every iteration of the loop. Those timeouts are upper bounds, but once the popup has already been dismissed, that wait_for_selector call has to run out the full 40 seconds every time, so each page is likely going to take at least 40 seconds before you even start parsing (plus the 10-second asyncio.sleep after each page). I suggest reducing those.
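As a rough sketch, the cookie-popup block inside parse could fail fast instead of stalling the loop; the 5000 ms value here is just an illustrative choice, not a magic number:

try:
    # Wait at most 5 seconds for the popup; a missing element always
    # costs the full timeout, so keep the value small
    await page.wait_for_selector('#rejectAllButton', state='visible', timeout=5000)
    await page.click('#rejectAllButton')
except Exception:
    pass  # popup already dismissed or never shown

After the first page, this costs at most 5 seconds per iteration instead of 40.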
To use scrapy's parsing, you can simply grab the page content with the playwright page, stick the returned HTML in a scrapy Selector, and then use its xpath and css methods, which are much faster and, in my opinion, easier to use. Here is an example:
Use caution: at this speed you could get your IP address banned from the site.
import scrapy
import asyncio
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector
from ..items import WorkshopCommentsItem


class A81tilesSpider(scrapy.Spider):
    name = '81tiles'
    start_urls = ['https://steamcommunity.com/sharedfiles/filedetails/?id=2881031511']

    def start_requests(self):
        yield scrapy.Request(
            url=self.start_urls[0],
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    PageMethod("wait_for_selector", "div.commentthread_comment_container", timeout=600)
                ],
                "page_number": 1
            }
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        page_number = response.meta["page_number"]
        while True:
            # Dismiss the cookie popup if it appears
            try:
                cookie_popup = await page.query_selector('#rejectAllButton')
                await page.wait_for_selector('#rejectAllButton', state='visible', timeout=400)
                await cookie_popup.click()
                await page.wait_for_selector(
                    "#commentthread_PublishedFile_Public_76561198262198841_2881031511_fpagebtn_next", state='visible',
                    timeout=400)
            except:
                pass
            await page.wait_for_selector('div.commentthread_comment_container')
            content = await page.content()  # get the page content
            selector = Selector(text=content)  # stick it in a scrapy selector
            # Each comment is its own div.commentthread_comment, not the container
            for comment in selector.css("div.commentthread_comment"):
                steam_item = WorkshopCommentsItem(
                    post_time=comment.xpath(".//span[@class='commentthread_comment_timestamp']/text()").get().strip(),
                    post_content=comment.xpath(".//div[@class='commentthread_comment_text']/text()").get().strip(),
                    post_author=comment.xpath(".//bdi/text()").get().strip()
                )
                yield steam_item
            # Check if there are more pages
            next_page_available = await page.evaluate(
                "(function(){return Boolean(document.querySelector('#commentthread_PublishedFile_Public_76561198262198841_2881031511_fpagebtn_next:not([disabled])'));})()")
            if next_page_available:
                print("Clicking on Next button")
                await page.click("#commentthread_PublishedFile_Public_76561198262198841_2881031511_fpagebtn_next")
                page_number += 1
                print(f"Processing page {page_number}")
                response.meta["page_number"] = page_number
                try:
                    await page.wait_for_selector("div.commentthread_comment_container", timeout=600)
                except:
                    pass
            else:
                break
        await page.close()
With the above code I was able to get every comment from each page. It took about 4 seconds to extract over 1,000 comments, so, like I said, use caution.
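If you want to stay polite without hand-rolled sleeps, Scrapy's built-in throttling settings can slow the crawl back down. A minimal sketch for settings.py, using standard Scrapy settings (the delay values are arbitrary starting points):

# settings.py -- throttle requests so the crawl stays polite
DOWNLOAD_DELAY = 2                   # at least 2 seconds between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 1   # one request to the domain at a time

# Or let AutoThrottle adjust the delay based on server response times
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 10

One caveat: these settings only pace Scrapy requests, and this spider paginates by clicking inside a single playwright page, so you may still want a short asyncio.sleep between next-button clicks to slow the in-page loop itself.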
Partial output:
{"post_time": "17 hours ago", "post_content": "so I don't need the original to use the fix?", "post_author": "LeeTG3"},
{"post_time": "Apr 7 @ 1:57am", "post_content": "@LeeTG3 remember that the Fix description where never updated and maintained... look in CR at the additional information for the fix... it explains the possible solutions.", "post_author": "Chamëleon TBN"},
{"post_time": "Apr 6 @ 4:36pm", "post_content": "It still says to unsubscribe from plopable asphalt even though the fix mod says that you need it, it should just say to disable it in the content manager", "post_author": "LeeTG3"},
{"post_time": "Apr 6 @ 3:50am", "post_content": "@Oldhip - as described in the workshop description text above: It creates a report, that you can open, read and follow the advices regardin to your mods....", "post_author": "Chamëleon TBN"},
{"post_time": "Apr 6 @ 3:19am", "post_content": "Does this show which mods installed are troublesome ? or do I have to have a list of what I have installed?", "post_author": "Oldhip"},
{"post_time": "Apr 5 @ 7:50pm", "post_content": "@Chamëleon TBN -- yes, and according to the Compatibility Report all mods should be running just fine. It's not a big deal for me because FPS Display gives me the only number in which I'm really interested. But I get nothing from Monitor It.", "post_author": "cahubble09"},