I am new to coding and am trying to learn as I go.
I'm trying to create a python script that will grab and print all HEADERS from a list of urls in a txt file.
It seems to be getting there, but I'm stuck in an infinite loop with one of the URLs and I have no idea why. Also, for some reason "-h" or "--help" won't call usage().
Any help would be appreciated.
Below is what I have so far:
#!/usr/bin/python
import pycurl
import cStringIO
import sys, getopt
# Module-level in-memory buffer, created once when the script is loaded.
buf = cStringIO.StringIO()
# Module-level pycurl handle, created once when the script is loaded.
c = pycurl.Curl()
def usage():
    """Print the supported command-line options and exit.

    Always terminates the process through sys.exit() with the default
    (success) status, so it never returns to its caller.
    """
    # The long form of -i is spelled --iurlist: that is the name getopt is
    # given in main(), so advertising --urlist would point users at an
    # option that is rejected.
    print("-h --help, -i --iurlist, -o --proxy")
    sys.exit()
def main(argv):
iurlist = None
proxy = None
try:
opts, args = getopt.getopt(argv,"hi:o:t",["help", "iurlist=","proxy="])
if not opts:
print "No options supplied"
print "Type -h for help"
sys.exit()
except getopt.GetoptError as err:
print str(err)
usage()
sys.exit(2)
for opt, arg in opts:
if opt == ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-i", "--iurlist"):
iurlist = arg
elif opt in ("-o", "--proxy"):
proxy = arg
else:
assert False, "Unhandeled option"
with open(iurlist) as f:
iurlist = f.readlines()
print iurlist
try:
for i in iurlist:
c.setopt(c.URL, i)
c.setopt(c.PROXY, proxy)
c.setopt(c.HEADER, 1)
c.setopt(c.FOLLOWLOCATION, 1)
c.setopt(c.MAXREDIRS, 30)
c.setopt(c.USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0')
c.setopt(c.TIMEOUT, 8)
c.setopt(c.CONNECTTIMEOUT, 5)
c.setopt(c.NOBODY, 1)
c.setopt(c.PROXY, proxy)
c.setopt(c.WRITEFUNCTION, buf.write)
c.setopt(c.SSL_VERIFYPEER, 0)
c.perform()
print buf.getvalue()
buf.close
except pycurl.error, error:
errno, errstr = error
print 'An error has occurred: ', errstr
if __name__ == "__main__":
    # Run only when executed as a script; pass everything after the
    # program name on to main().
    main(sys.argv[1:])
This is the latest code:
#!/usr/bin/python
import pycurl
import cStringIO
import sys, getopt
# Module-level pycurl handle, created once when the script is loaded.
c = pycurl.Curl()
def usage():
    """Print the supported command-line options with an example, then exit.

    Always terminates the process through sys.exit() with the default
    (success) status, so it never returns to its caller.
    """
    # The long form of -i is spelled --iurlist: that is the name getopt is
    # given in main(), so advertising --urlist would point users at an
    # option that is rejected.
    print("-h --help, -i --iurlist, -o --proxy")
    print("Example Usage: cURLdect.py -i urlist.txt -o http://192.168.1.64:8080")
    sys.exit()
def main(argv):
iurlist = None
proxy = None
try:
opts, args = getopt.getopt(argv,"hi:o:t",["help", "iurlist=","proxy="])
if not opts:
print "No options supplied"
print "Type -h for help"
sys.exit()
except getopt.GetoptError as err:
print str(err)
usage()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-i", "--iurlist"):
iurlist = arg
elif opt in ("-o", "--proxy"):
proxy = arg
else:
assert False, "Unhandeled option"
with open(iurlist) as f:
iurlist = f.readlines()
print iurlist
try:
for i in iurlist:
buf = cStringIO.StringIO()
c.setopt(c.WRITEFUNCTION, buf.write)
c.setopt(c.PROXY, proxy)
c.setopt(c.HEADER, 1)
c.setopt(c.FOLLOWLOCATION, 1)
c.setopt(c.MAXREDIRS, 300)
c.setopt(c.USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0')
c.setopt(c.TIMEOUT, 8)
c.setopt(c.CONNECTTIMEOUT, 5)
c.setopt(c.NOBODY, 1)
c.setopt(c.SSL_VERIFYPEER, 0)
c.setopt(c.URL, i)
c.perform()
print buf.getvalue()
buf.close()
except pycurl.error, error:
errno, errstr = error
print 'An error has occurred: ', errstr
if __name__ == "__main__":
    # Run only when executed as a script; pass everything after the
    # program name on to main().
    main(sys.argv[1:])
If you are learning, pycurl may not be the best option. Its authors assume you're familiar with the libcurl library. From http://pycurl.sourceforge.net/:
PycURL is targeted at an advanced developer - if you need dozens of concurrent, fast and reliable connections or any of the sophisticated features listed above then PycURL is for you.
The main drawback of PycURL is that it is a relatively thin layer over libcurl without any of those nice Pythonic class hierarchies. This means it has a somewhat steep learning curve unless you are already familiar with libcurl's C API.
This is how they do a multi-fetch: https://github.com/pycurl/pycurl/blob/master/examples/retriever-multi.py
To fetch the headers the Pythonic way, install the requests
library and just do:
import requests

# list_of_urls is expected to be an iterable of URL strings defined by the
# surrounding script (for example the stripped lines of the URL file).
for url in list_of_urls:
    # Always pass a timeout: without one, requests can wait indefinitely on
    # an unresponsive server. 8 seconds mirrors the TIMEOUT used in the
    # pycurl script above.
    r = requests.get(url, timeout=8)
    print(r.headers)
To deal with command-line arguments, use the argparse
module from the batteries included with Python.