Search code examples
python multithreading hostname resolve

Python socket.gethostbyname_ex() multithread fails


I wrote a script that should resolve multiple hostnames into IP addresses using multithreading.

However, it fails and freezes at some random point. How can this be solved?

# Number of worker threads the script starts.
num_threads = 100
# Single module-level MySQL connection; mexec() opens its cursors on it.
conn = pymysql.connect(
    host='xx.xx.xx.xx',
    unix_socket='/tmp/mysql.sock',
    user='user',
    passwd='pw',
    db='database',
)
# Module-level cursor on that connection.
cur = conn.cursor()
import threading

# Serializes use of the shared ``conn``: PyMySQL declares DB-API
# threadsafety level 1, i.e. a connection must not be used by several
# threads at the same time.
_conn_lock = threading.Lock()


def mexec(befehl, args=None):
    """Execute one SQL statement on the shared connection and commit it.

    Args:
        befehl: SQL statement text ("Befehl" is German for "command").
        args: Optional parameters the driver binds to the ``%s``
            placeholders in ``befehl``. Prefer this over assembling the
            statement by string concatenation.

    Raises:
        Whatever the driver raises for a failed statement; the cursor is
        closed and the lock released before the error propagates.
    """
    with _conn_lock:
        cursor = conn.cursor()
        try:
            cursor.execute(befehl, args)
            # PyMySQL connections do not autocommit by default, so without
            # an explicit commit the change is lost when the script exits.
            conn.commit()
        finally:
            cursor.close()

# Hostnames to resolve. The list is abbreviated for the post: the ellipses
# stand in for the remaining entries.
websites = ['facebook.com','facebook.org' ... ... ... ...] \#10.000 websites in array
# Work queue: handed to every worker thread and filled by the main script.
queue = Queue()
def getips(i, q):
    """Worker loop: resolve each hostname taken from ``q`` and store the result.

    Never returns; intended as the target of a daemon thread. Exactly one
    ``q.task_done()`` is issued per item taken, whether or not the lookup
    or the database write succeeded, so ``q.join()`` cannot block forever.

    Args:
        i: Index of the worker thread (unused; kept for existing callers).
        q: Queue of hostname strings to resolve.
    """
    # NOTE(review): neither UPDATE below has a WHERE clause, so each call
    # rewrites every row of sites2block -- confirm the key column and
    # restrict the statements to the row for ``site``.
    while True:
        # Block until a hostname is available. Without this get() no item is
        # ever consumed and the lookup would use an unrelated global name.
        site = q.get()
        try:
            #--resolve IP--
            try:
                result = socket.gethostbyname_ex(site)
            except (socket.gaierror, socket.herror):
                print("no ip")
                mexec("UPDATE sites2block SET ip='no ip', updated='yes'")
            else:
                print(result)
                # gethostbyname_ex() returns (hostname, aliaslist, ipaddrlist);
                # a tuple cannot be concatenated to a str, so store the
                # addresses as one comma-separated string.
                ips = ",".join(result[2])
                mexec("UPDATE sites2block SET ip='" + ips + "', updated='yes'")  # puts site in mysqldb
        except Exception as exc:
            # Outermost boundary of the worker: report the failure and keep
            # the thread alive, otherwise the pool drains and items are
            # left unprocessed in the queue.
            print("error %s: %s" % (site, exc))
        finally:
            q.task_done()
#Spawn thread pool
for i in range(num_threads):
    worker = Thread(target=getips, args=(i, queue))
    # Daemon threads do not keep the interpreter alive after the main script
    # finishes. The ``daemon`` attribute replaces the deprecated setDaemon().
    worker.daemon = True
    worker.start()
#Place work in queue
for site in websites:
    queue.put(site)
#Block until task_done() has been called once for every queued item
queue.join()

Solution

  • You could use a sentinel value to signal to the threads that there is no more work, and then join the threads themselves instead of relying on queue.task_done() and queue.join():

    #!/usr/bin/env python
    import socket
    from Queue import Queue
    from threading import Thread
    
    def getips(queue):
        """Resolve every hostname taken from *queue* until a ``None`` sentinel.

        Prints one line per hostname: the resolver result on success, or the
        error on failure. Returns once ``None`` is read from the queue.

        Args:
            queue: Queue yielding hostname strings, terminated by ``None``.
        """
        # iter(callable, sentinel) calls queue.get() repeatedly and stops as
        # soon as it returns None.
        for site in iter(queue.get, None):
            try: # resolve hostname
                result = socket.gethostbyname_ex(site)
            except IOError as e:  # 'as' form parses on Python 2.6+ and 3
                print("error %s reason: %s" % (site, e))
            else:
                print("done %s %s" % (site, result))
    
    def main(websites=None, num_threads=20):
        """Resolve *websites* concurrently with a pool of worker threads.

        Args:
            websites: Iterable of hostnames to resolve. When omitted, a small
                built-in demo list of ``.com`` names is used.
            num_threads: Number of worker threads to start (default 20).
        """
        if websites is None:
            names = "youtube google non-existent.example facebook yahoo live".split()
            websites = [name + '.com' for name in names]

        # Spawn thread pool
        queue = Queue()
        threads = [Thread(target=getips, args=(queue,)) for _ in range(num_threads)]
        for t in threads:
            t.daemon = True
            t.start()

        # Place work in queue
        for site in websites:
            queue.put(site)
        # Put one sentinel per worker to signal the end: each thread stops
        # after reading a single None.
        for _ in threads:
            queue.put(None)
        # Wait for completion
        for t in threads:
            t.join()
    
    if __name__ == "__main__":
        # Run the demo only when executed as a script, not when imported.
        main()
    

    Note that the gethostbyname_ex() function is obsolete: it resolves IPv4 addresses only. To support both IPv4 and IPv6 addresses, you could use socket.getaddrinfo() instead.