I've created a script that fetches newspaper names from a search engine when I search with different keywords, such as CMG제약, DB하이텍, etc., in that page's top-right search box.
I also pass customized dates in the params to restrict the results to those date ranges. The script does fine as long as I use a single keyword in the search list.
However, when I use multiple keywords in the search list, the script only keeps up with the last keyword. This is the list of keywords I would like to use:
keywords = ['CMG제약','DB하이텍','ES큐브','EV첨단소재']
The script itself is short; it only looks long because of the size of the params dict.
This is what I've tried so far (it works as intended when there is a single search keyword in the list):
import requests
import concurrent.futures
from bs4 import BeautifulSoup
from urllib.parse import urljoin
year_list_start = ['2013.01.01','2014.01.02']
year_list_upto = ['2014.01.01','2015.01.01']
base = 'https://search.naver.com/search.naver'
link = 'https://search.naver.com/search.naver'
params = {
    'where': 'news',
    'sm': 'tab_pge',
    'query': '',
    'sort': '1',
    'photo': '0',
    'field': '0',
    'pd': '',
    'ds': '',
    'de': '',
    'cluster_rank': '',
    'mynews': '0',
    'office_type': '0',
    'office_section_code': '0',
    'news_office_checked': '',
    'nso': '',
    'start': '',
}

def fetch_content(s, keyword, link, params):
    for start_date, date_upto in zip(year_list_start, year_list_upto):
        ds = start_date.replace(".", "")
        de = date_upto.replace(".", "")
        params['query'] = keyword
        params['ds'] = ds
        params['de'] = de
        params['nso'] = f'so:r,p:from{ds}to{de},a:all'
        params['start'] = 1
        while True:
            res = s.get(link, params=params)
            print(res.status_code)
            print(res.url)
            soup = BeautifulSoup(res.text, "lxml")
            if not soup.select_one("ul.list_news .news_area .info_group > a.press"): break
            for item in soup.select("ul.list_news .news_area"):
                newspaper_name = item.select_one(".info_group > a.press").get_text(strip=True).lstrip("=")
                print(newspaper_name)
            if soup.select_one("a.btn_next[aria-disabled='true']"): break
            next_page = soup.select_one("a.btn_next").get("href")
            link = urljoin(base, next_page)
            params = None

if __name__ == '__main__':
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
        keywords = ['CMG제약']
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            future_to_url = {executor.submit(fetch_content, s, keyword, link, params): keyword for keyword in keywords}
            concurrent.futures.as_completed(future_to_url)
How can I make the script work when there is more than one keyword in the search list?
I believe the problem is that the shared params dict is prematurely being overwritten with data for a subsequent request while a previous request is still being processed. params needs to be moved into fetch_content, so that each thread gets its own private copy, rather than being passed as an argument:
import requests
import concurrent.futures
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from threading import Lock
year_list_start = ['2013.01.01','2014.01.02']
year_list_upto = ['2014.01.01','2015.01.01']
base = 'https://search.naver.com/search.naver'
link = 'https://search.naver.com/search.naver'
print_lock = Lock()
def fetch_content(f, s, keyword, link):
    # Build params inside the function so that each thread gets its own
    # private dict and cannot overwrite another thread's request data:
    params = {
        'where': 'news',
        'sm': 'tab_pge',
        'query': '',
        'sort': '1',
        'photo': '0',
        'field': '0',
        'pd': '',
        'ds': '',
        'de': '',
        'cluster_rank': '',
        'mynews': '0',
        'office_type': '0',
        'office_section_code': '0',
        'news_office_checked': '',
        'nso': '',
        'start': '',
    }
    for start_date, date_upto in zip(year_list_start, year_list_upto):
        # Restart each date range from the base search URL; otherwise the
        # paginated link left over from the previous range would be reused:
        page_link = link
        my_params = params  # restore params after pagination set it to None
        ds = start_date.replace(".", "")
        de = date_upto.replace(".", "")
        my_params['query'] = keyword
        my_params['ds'] = ds
        my_params['de'] = de
        my_params['nso'] = f'so:r,p:from{ds}to{de},a:all'
        my_params['start'] = 1
        while True:
            res = s.get(page_link, params=my_params)
            with print_lock:
                print(keyword, res.status_code, file=f)
                print(keyword, res.url, file=f, flush=True)
            soup = BeautifulSoup(res.text, "lxml")
            if not soup.select_one("ul.list_news .news_area .info_group > a.press"): break
            for item in soup.select("ul.list_news .news_area"):
                newspaper_name = item.select_one(".info_group > a.press").get_text(strip=True).lstrip("=")
                with print_lock:
                    print(keyword, newspaper_name, file=f, flush=True)
            if soup.select_one("a.btn_next[aria-disabled='true']"): break
            next_page = soup.select_one("a.btn_next").get("href")
            page_link = urljoin(base, next_page)
            my_params = None  # the next-page href already carries the query string

if __name__ == '__main__':
    with requests.Session() as s:
        with open('output.txt', 'w', encoding='utf8') as f:
            s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
            keywords = ['CMG제약', 'DB하이텍', 'ES큐브', 'EV첨단소재']
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                future_to_url = {executor.submit(fetch_content, f, s, keyword, link): keyword for keyword in keywords}
                concurrent.futures.as_completed(future_to_url)
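To make the failure mode concrete, here is a minimal, self-contained sketch of the same race (fake_fetch and the sleep are hypothetical stand-ins for your scraper and the network delay): four threads mutate one shared dict, and by the time a slow "request" reads it back, the last writer has usually won:

import time
import concurrent.futures

shared_params = {'query': ''}

def fake_fetch(params, keyword):
    params['query'] = keyword        # every thread mutates the same dict
    time.sleep(0.1)                  # stand-in for a slow HTTP request
    return keyword, params['query']  # usually the last keyword written wins

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(fake_fetch, shared_params, kw)
               for kw in ['CMG제약', 'DB하이텍', 'ES큐브', 'EV첨단소재']]
    for future in concurrent.futures.as_completed(futures):
        print(future.result())  # e.g. ('CMG제약', 'EV첨단소재')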
Note
You have ...
future_to_url = {executor.submit(fetch_content, s, keyword, link): keyword for keyword in keywords}
concurrent.futures.as_completed(future_to_url)
... where concurrent.futures.as_completed(future_to_url) returns an iterator that you never actually iterate. Since the ThreadPoolExecutor is used as a context manager, its implicit shutdown already waits for every submitted task to finish, so you might as well just replace the above two lines with:
for keyword in keywords:
    executor.submit(fetch_content, s, keyword, link)
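(If you do want results or error reporting as tasks finish, iterate as_completed; calling each future's result() is also how an exception raised inside fetch_content gets surfaced. A minimal pattern, reusing executor, s, link and keywords from the script above:)

future_to_keyword = {executor.submit(fetch_content, s, keyword, link): keyword
                     for keyword in keywords}
for future in concurrent.futures.as_completed(future_to_keyword):
    try:
        future.result()  # re-raises anything fetch_content raised in its thread
    except Exception as exc:
        print(future_to_keyword[future], 'failed:', exc)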
Or you can make keyword the final argument to fetch_content
...
def fetch_content(s, link, keyword):
... and then
from functools import partial
executor.map(partial(fetch_content, s, link), keywords)
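Note that executor.map returns its results lazily: an exception raised inside fetch_content is only re-raised when the returned iterator is consumed. So even in this version, where fetch_content only prints and returns nothing, it is worth draining the iterator (a small sketch under that assumption):

from functools import partial

# Drain the lazy iterator so any exception raised inside
# fetch_content is re-raised here instead of being silently lost:
for _ in executor.map(partial(fetch_content, s, link), keywords):
    pass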
Update 2
Here is a modified version where fetch_content returns the list of newspaper names it finds rather than printing them (the main thread then prints the lists), and the other print statements are commented out to reduce the extra "noise" so that the results can be included here. I have also changed the order of the arguments in case you want to use map instead:
import requests
import concurrent.futures
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from functools import partial
year_list_start = ['2013.01.01','2014.01.02']
year_list_upto = ['2014.01.01','2015.01.01']
base = 'https://search.naver.com/search.naver'
link = 'https://search.naver.com/search.naver'
def fetch_content(s, link, keyword):
    # A private params dict per thread, as before:
    params = {
        'where': 'news',
        'sm': 'tab_pge',
        'query': '',
        'sort': '1',
        'photo': '0',
        'field': '0',
        'pd': '',
        'ds': '',
        'de': '',
        'cluster_rank': '',
        'mynews': '0',
        'office_type': '0',
        'office_section_code': '0',
        'news_office_checked': '',
        'nso': '',
        'start': '',
    }
    newspaper_names = []
    for start_date, date_upto in zip(year_list_start, year_list_upto):
        page_link = link    # restart each date range from the base search URL
        my_params = params  # restore params after pagination set it to None
        ds = start_date.replace(".", "")
        de = date_upto.replace(".", "")
        my_params['query'] = keyword
        my_params['ds'] = ds
        my_params['de'] = de
        my_params['nso'] = f'so:r,p:from{ds}to{de},a:all'
        my_params['start'] = 1
        while True:
            res = s.get(page_link, params=my_params)
            #print(res.status_code, flush=True)
            #print(res.url, flush=True)
            soup = BeautifulSoup(res.text, "lxml")
            if not soup.select_one("ul.list_news .news_area .info_group > a.press"): break
            for item in soup.select("ul.list_news .news_area"):
                newspaper_name = item.select_one(".info_group > a.press").get_text(strip=True).lstrip("=")
                newspaper_names.append(newspaper_name)
            if soup.select_one("a.btn_next[aria-disabled='true']"): break
            next_page = soup.select_one("a.btn_next").get("href")
            page_link = urljoin(base, next_page)
            my_params = None  # the next-page href already carries the query string
    return newspaper_names

if __name__ == '__main__':
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
        keywords = ['CMG제약', 'DB하이텍', 'ES큐브', 'EV첨단소재']
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            future_to_url = {executor.submit(fetch_content, s, link, keyword): keyword for keyword in keywords}
            for future in concurrent.futures.as_completed(future_to_url):
                print('keyword = ', future_to_url[future], 'newspaper names =', future.result())
            """
            results = executor.map(partial(fetch_content, s, link), keywords)
            for idx, result in enumerate(results):
                print('keyword = ', keywords[idx], 'newspaper names =', result)
            """
Prints:
keyword = DB하이텍 newspaper names = []
keyword = ES큐브 newspaper names = ['국제신문', '스포츠월드', '스포츠조선', '이뉴스투데이', '중앙SUNDAY', '중앙SUNDAY', '매일경제', '디지털데일리', '전자신문', '머니투데이', '한경비즈니스', '한경비즈니스', '동아일보', '뉴시스', '데일리안', '매일경제', '한경비즈니스', '한경비즈니스', '동아일보', '뉴시스', '데일리안', '매일경제']
keyword = EV첨단소재 newspaper names = ['머니S', '아주경제', 'EBN', '오토타임즈', '머니S', '서울경제', '뉴시스', '파이낸셜뉴스', '연합뉴스', '연합뉴스', 'EBN', '뉴스핌', '포브스코리아', 'EBN', '시민일보', '매일경제', '세계일보', 'TV리포트', '전기신문', '뉴시스', '기호일보', '스포츠월드', 'OSEN', '뉴시스', '경북매일신문', '파이낸셜뉴스', '이투데이', '뉴시스', '헤럴드경제', '헤럴드POP', '조선비즈', 'EBN', '아주경제', '뉴스1', '아시아경제', '헤럴드경제', '전자신문', '뉴시스', '뉴시스', '전기신문', '전자신문', '오토타임즈', '연합뉴스', '에너지경제', '서울경제', 'EBN', '서울경제', '파이낸셜뉴스', '전자신문', '오토타임즈', '연합뉴스', '에너지경제', '서울경제', 'EBN', '서울경제', '파이낸셜뉴스']
keyword = CMG제약 newspaper names = ['국민일보', '국민일보', '메디컬투데이', '한국경제', '서울경제', '매일경제', '시민일보', '아시아경제', '데일리안', '조선비즈', '메디파나뉴스', '매일경제', 'TBS', '매일경제', 'MBN', '아시아경제', 'KBS', '뉴스토마토', '연합뉴스', '뉴스1', '국민일보', '뉴시스', '국민일보', '뉴스토마토', '아시아투데이', '청년의사', '메디파나뉴스', '이데일리', '메디컬투데이', '한국경제', '아시아경제', '이투데이', '머니투데이', '뉴스토마토', '연합뉴스', '약업신문', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '이데일리', '뉴스토마토', '머니투데이', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '머니투데이', '머니투데이', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '이투데이', '한국경제', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '헤럴드POP', '뉴스토마토', '한국경제', '서울경제', '매일경제', '뉴스토마토', '서울파이낸스', '뉴스토마토', '이데일리', '헤럴드POP', '뉴스토마토', '뉴스토마토', '뉴스토마토', '머니투데이', '뉴스토마토', '한국경제', '이투데이', '파이낸셜뉴스', '매일경제', '뉴시스', '뉴스토마토', '뉴스토마토', '이투데이', 'EBN', 'NSP통신', '이투데이', '아주경제', '한국경제', '뉴스핌', '뉴스토마토', '이데일리', '헤럴드POP', '머니투데이', '머니투데이', '아시아경제', 'NSP통신', '서울파이낸스', '아시아경제', '뉴스토마토', '이데일리', '이투데이', '뉴스토마토', '이데일리', '뉴스핌', '머니투데이', '헤럴드POP', '이데일리', '이투데이', '세계일보', '뉴스토마토', '서울파이낸스', '머니투데이', '이데일리', '이투데이', '컨슈머타임스', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '약업신문', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '약업신문', '약업신문', '뉴시스', '연합뉴스', '뉴스토마토', '약업신문', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '뉴스토마토', '약업신문', '약업신문', '한국경제', '서울경제', '이데일리', '이투데이', '한국경제', '매일경제', '이데일리', '서울경제', '매일경제', '이데일리', '서울경제', '이투데이', '파이낸셜뉴스', '조선비즈', '뉴스핌', '한국경제', '머니투데이', '파이낸셜뉴스', '매일경제', '파이낸셜뉴스', '파이낸셜뉴스', '연합뉴스', '데일리팜', '데일리팜', '조선비즈', '이투데이', '한국경제', 'MTN', '서울경제', '뉴스토마토', '메디파나뉴스', '조선비즈', '파이낸셜뉴스', '한국경제', '아시아경제', '이투데이', '연합뉴스', '한국경제', '뉴스핌', '이데일리', '머니투데이', '매일경제', '약업신문', '뉴스토마토', '메디파나뉴스', '파이낸셜뉴스', '파이낸셜뉴스', '한국경제', '이투데이', '머니투데이', '연합뉴스', '이투데이', '매일경제', '매일경제', '뉴스토마토', '서울경제', '이투데이', '아주경제', '이데일리', '한국경제', '헤럴드POP', '매일경제', '뉴스핌', '머니투데이', '머니투데이', '서울파이낸스', '뉴스토마토', '헤럴드POP', '뉴스토마토', '한국경제', '한국경제', '서울경제', '한국경제', '이데일리', '헤럴드POP', '조선비즈', '아주경제', '서울경제', '매일경제', '뉴시스', '뉴스토마토', '뉴스핌', '연합뉴스', '파이낸셜뉴스', '매일경제', '이투데이', '아시아경제', '매일경제', '이투데이', '아시아경제']
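Since fetch_content now returns plain lists, the main thread can post-process them however it likes, for example tallying how often each paper appears. A small sketch (the summarize helper is hypothetical, shown with made-up input):

from collections import Counter

def summarize(newspaper_names, top_n=5):
    # Return the top_n most frequent newspaper names with their counts:
    return Counter(newspaper_names).most_common(top_n)

print(summarize(['뉴스토마토', '뉴스토마토', '매일경제', '이데일리', '뉴스토마토']))
# [('뉴스토마토', 3), ('매일경제', 1), ('이데일리', 1)]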
Note
If you uncomment the other print statements and print results as they are completed (i.e. by using as_completed), the printout of the newspaper lists will be interspersed among the other printed lines and may be hard to spot. In that case you may wish to use the map version that I have included but commented out, so that the newspaper lists are only printed after all the results have been returned and all the debugging output has been issued.
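One more piece of shared state to be aware of: all four threads reuse the single requests.Session, and Session is not documented as thread-safe. The script above works in practice, but if you ever see odd behavior you can give each thread its own session via threading.local (a defensive sketch, not something the answer requires):

import threading
import requests

thread_local = threading.local()

def get_session():
    # Lazily create one Session per thread instead of sharing one:
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()
        thread_local.session.headers['User-Agent'] = 'Mozilla/5.0'
    return thread_local.session

# fetch_content would then call get_session() instead of receiving
# the shared session s as an argument.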