from twill.commands import *
from bs4 import BeautifulSoup
from urllib import urlopen
import urllib2
with open('urls.txt') as inf:
urls = (line.strip() for line in inf)
for url in urls:
try:
urllib2.urlopen(url)
except urllib2.HTTPError, e:
print e
site = urlopen(url)
soup = BeautifulSoup(site)
for td in soup.find_all('td', {'class': 'subjectCell'}):
print td.find('a').text
My code opens only a single page for each URL in the file. Sometimes there are more pages; in that case the pattern for the subsequent pages is `&page=x`.
Here are the pages I'm talking about:
http://www.last.fm/user/TheBladeRunner_/library/tags?tag=long+track http://www.last.fm/user/TheBladeRunner_/library/tags?tag=long+track&page=7
You could read the `href` attribute from the next-page link and append it to your `urls` list (note: you should change your generator expression to a list, since you cannot append to a generator). It could look something like this:
from twill.commands import *
from bs4 import BeautifulSoup
from urllib import urlopen
import urllib2
import urlparse
with open('urls.txt') as inf:
urls = [line.strip() for line in inf]
for url in urls:
try:
urllib2.urlopen(url)
except urllib2.HTTPError, e:
print e
site = urlopen(url)
soup = BeautifulSoup(site)
for td in soup.find_all('td', {'class': 'subjectCell'}):
print td.find('a').text
next_page = soup.find_all('a', {'class': 'nextlink'}):
if next_page:
next_page = next_page[0]
urls.append(urlparse.urljoin(url, next_page['href']))