I'm currently developing a web crawler that works through a list of URLs stored in a queue file. The spider needs to scrape all of the words from each of those pages before it moves on to the next link in the queue. I'm looking for a point in the right direction on setting it up so that the scraper checks each word against my common.txt (to skip common words) and against the keyword list itself (so a word isn't added twice).
I tried something like this with get_keywords in my spider.py, but it isn't doing anything. I may be missing something simple, as I've been coding all day, but anyway, here is my code.
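To be concrete, this is roughly the behaviour I'm after, in isolation (just a sketch to illustrate the idea; add_new_keywords and the file paths are placeholders, not code from my project):

# sketch: keep a word only if it's alphanumeric, isn't in common.txt,
# and hasn't already been collected in keywords.txt
def add_new_keywords(page_words, common_path='common.txt',
                     keywords_path='keywords.txt'):
    with open(common_path) as f:
        common = set(line.strip().lower() for line in f)
    with open(keywords_path) as f:
        collected = set(line.strip() for line in f)
    with open(keywords_path, 'a') as out:
        for word in page_words.lower().split():
            if word.isalnum() and word not in common and word not in collected:
                out.write(word + '\n')
                collected.add(word)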
Spider.py
from Gen_info import *


class Spider:

    project_name = ''
    queue_file = ''
    crawled_file = ''
    keyword_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name):
        Spider.project_name = project_name
        Spider.queue_file = Spider.project_name + '/Chrome_Hist.csv'
        Spider.crawled_file = Spider.project_name + '/CrawledUrls.txt'
        self.boot()
        # self.crawl_page('First spider', Spider.queue)

    # Creates directory and files for project on first run and starts the spider
    @staticmethod
    def boot():
        create_project_dir(Spider.project_name)
        create_files(Spider.project_name)
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    # Updates user display, fills queue and updates files
    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
            Spider.queue.remove(page_url)
            Spider.crawled.add(page_url)
            Spider.update_files()

    @staticmethod
    def update_files():
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)

    @staticmethod
    def get_keywords(page_words):
        common = set(open('Common_words.txt').read().split('\n'))
        word_dict = {}
        # count each alphanumeric word that isn't in the common list
        for word in page_words.lower().split():
            if word not in common and word.isalnum():
                if word not in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[word] += 1
        return word_dict
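My rough idea is to call it from crawl_page once a page has been fetched, something like this (only a sketch; fetch_page_text is a hypothetical helper, since the actual fetching currently lives in main.py):

import urllib2
from bs4 import BeautifulSoup

def fetch_page_text(page_url):
    soup = BeautifulSoup(urllib2.urlopen(page_url).read(), 'html.parser')
    for tag in soup(['script', 'style']):  # drop non-visible text
        tag.extract()
    return soup.get_text()

word_counts = Spider.get_keywords(fetch_page_text('http://example.com'))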
main.py
import threading
from Queue import Queue
from Spider import Spider
from Gen_info import *
import urllib2
from bs4 import BeautifulSoup
from shutil import copyfile
import os

PROJECT_NAME = 'History Forensics'
QUEUE_FILE = PROJECT_NAME + '/Chrome_Hist.csv'
CRAWLED_FILE = PROJECT_NAME + '/CrawledUrls.txt'
NUMBER_OF_THREADS = 2

queue = Queue()  # lowercase name so it doesn't shadow the Queue class imported above
Spider(PROJECT_NAME)

# raw strings so the backslashes aren't treated as escape sequences
src = r'C:\Users\Lewis Collins\Python Project\ChromeDBs\Chrome_Hist.csv'
dst = PROJECT_NAME
path = r'C:\Users\Lewis Collins\Python Project\ChromeDBs\Chrome_Hist.csv'


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()


# Each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


# Check if there are items in the queue, if so crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


def get_keywords():
    # strip() each line: readlines() keeps the trailing '\n', so
    # 'the\n' never equals 'the' and no word ever gets filtered out
    common_words = set(line.strip() for line in open('File_Storage/common.txt', 'r'))
    keywords = set(open(PROJECT_NAME + '/keywords.txt', 'r').read().split('\n'))
    f = open(PROJECT_NAME + '/keywords.txt', 'a')
    hist_queue = file_to_set(QUEUE_FILE)
    for url in hist_queue:
        html_content = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html_content, 'html.parser')
        # drop script and style blocks so only visible text remains
        for tag in soup(['script', 'style']):
            tag.extract()
        visible_text = soup.get_text()
        for word in visible_text.lower().split():
            if word.isalnum() and word not in common_words and word not in keywords:
                f.write(word + '\n')
                keywords.add(word)
    f.close()


# copyfile(src, dst)
# os.remove(path)

create_workers()
get_keywords()
crawl()
Any questions about how it works, fire away, and I can post any other code you may need to see.
Thanks in advance, everyone.