When I run the program, it gets stuck in the while loop and I can't figure out why. Here is the code:
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from multiprocessing.pool import Pool
def urlAmazon(url=str(), paging_method=int()):
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    url = url.format(paging_method)
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, "lxml")
    productList = soup.find("div", "s-main-slot s-result-list s-search-results sg-row").findAll("div", "sg-col-4-of-24 sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20")
    if not soup.find("div", "a-section a-text-center s-pagination-container"):
        exit()
    else:
        pass
    print(paging_method, len(productList))
    for product in productList:
        productName = product.find("h2").getText(strip=True)
        try:
            productPrice = product.find("span", "a-offscreen").getText(strip=True)
        except:
            productPrice = None
        with open("products.txt", "a", encoding="UTF-8") as f:
            f.write(f"{productName}: {productPrice} - {paging_method}\n")

def main(pool, loop):
    page = 1
    url = "https://www.amazon.com.tr/s?i=electronics&bbn=13709880031&rh=n%3A13709880031%2Cp_n_fulfilled_by_amazon%3A21345978031&dc&page={}&rnid=21345970031&ref=sr_pg_2"
    while loop:
        for i in range(11, -1, -1):
            p = pool.apply_async(func=urlAmazon, args=[url, 12*page - i])
        page += 1
    print("End of the code")

if __name__ == '__main__':
    pool = Pool()
    loop = True
    start = time.perf_counter()
    main(pool, loop)
    finish = time.perf_counter()
    print(round(finish - start, 2))
So as you can see, I'm stuck right now. I suspect the while loop is causing the problem, but I cannot solve it.
Details: this program is meant to fetch 24 x 400 products by scraping the result pages 12 at a time. For example:
def main(pool, loop):
    page = 1
    url = "https://www.amazon.com.tr/s?i=electronics&bbn=13709880031&rh=n%3A13709880031%2Cp_n_fulfilled_by_amazon%3A21345978031&dc&page={}&rnid=21345970031&ref=sr_pg_2"
    while loop:
        for i in range(11, -1, -1):
            p = pool.apply_async(func=urlAmazon, args=[url, 12*page - i])
        page += 1
    print("End of the code")
This method scans the pages batch by batch in this order: 1 to 12, then 13 to 24, and so on, so I can fetch the data for several pages at the same time.
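To make that ordering concrete, this is how the 12*page - i expression expands for the first two batches:

# page = 1, i from 11 down to 0  ->  pages 1, 2, ..., 12
# page = 2, i from 11 down to 0  ->  pages 13, 14, ..., 24
for page in (1, 2):
    print([12*page - i for i in range(11, -1, -1)])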
But as I said, I cannot get the main method, which fetches the data, to terminate.
I believe the problem is that your while condition is always True, so when you call main(pool, loop) the function runs forever: nothing inside the loop ever breaks out of it or sets loop to False. You should probably remove the while loop.
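If you would rather keep apply_async and the batching idea, the loop needs something that ends it, for example a fixed number of batches (total_batches below is just a placeholder you would choose yourself), and you should wait on the results so main only returns once the work is done. A rough sketch using your existing urlAmazon:

def main(pool, total_batches):
    # total_batches is a placeholder: how many groups of 12 pages you want to fetch
    url = "https://www.amazon.com.tr/s?i=electronics&bbn=13709880031&rh=n%3A13709880031%2Cp_n_fulfilled_by_amazon%3A21345978031&dc&page={}&rnid=21345970031&ref=sr_pg_2"
    results = []
    for page in range(1, total_batches + 1):
        for i in range(11, -1, -1):
            results.append(pool.apply_async(func=urlAmazon, args=[url, 12*page - i]))
    for r in results:   # block until every submitted page has been processed
        r.wait()
    print("End of the code")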
You should try using the map function on your pool. The code could look like this:
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from multiprocessing.pool import Pool
def urlAmazon(paging_method=int()):
    url = f"https://www.amazon.com.tr/s?i=electronics&bbn=13709880031&rh=n%3A13709880031%2Cp_n_fulfilled_by_amazon%3A21345978031&dc&page={paging_method}&rnid=21345970031&ref=sr_pg_2"
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, "lxml")
    productList = soup.find("div", "s-main-slot s-result-list s-search-results sg-row").findAll("div", "sg-col-4-of-24 sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20")
    if not soup.find("div", "a-section a-text-center s-pagination-container"):
        exit()
    else:
        pass
    print(paging_method, len(productList))
    for product in productList:
        productName = product.find("h2").getText(strip=True)
        try:
            productPrice = product.find("span", "a-offscreen").getText(strip=True)
        except:
            productPrice = None
        with open("products.txt", "a", encoding="UTF-8") as f:
            f.write(f"{productName}: {productPrice} - {paging_method}\n")

def main():
    page_method = list(range(12))  # change this!!!
    with Pool(12) as p:  # since 12 is the number of processes you want
        p.map(urlAmazon, page_method)  # i have no idea
    print("If we got here, something went wrong")

if __name__ == '__main__':
    start = time.perf_counter()
    main()
    finish = time.perf_counter()
    print(round(finish - start, 2))
I have no idea which page numbers you actually need for the paging method, but build an iterable of them, e.g. a list of all of the page numbers you want to pass in.
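For example, if you wanted pages 1 through 400 (400 is just my guess based on your 24x400 figure), main could be:

def main():
    page_method = list(range(1, 401))   # pages 1..400 -- adjust to the range you actually need
    with Pool(12) as p:                 # 12 worker processes
        p.map(urlAmazon, page_method)   # each worker pulls page numbers from the list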