
Why am I failing to get the IP Address of one website url using networkx and requests?


I am trying to map website urls to their ip addresses and graph them using networkx. My code finds all links on a webpage using BeautifulSoup and follows them. I had trouble assigning attributes directly to the nodes, so I used networkx set_node_attributes with two lists containing the urls and the ip addresses. However, one url consistently fails to get an ip address assigned.
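For reference, this is my understanding of the two ways an attribute can be attached to a node in networkx, as a tiny standalone sketch (the url and address below are made up for illustration):

import networkx as nx

G = nx.Graph()

# Option 1: attach the attribute directly when the node is created
G.add_node('https://example.com/', ip_address=('93.184.216.34', 443))

# Option 2: attach it afterwards from a dict keyed by node
attrs = {'https://example.com/': ('93.184.216.34', 443)}
nx.set_node_attributes(G, values=attrs, name='ip_address')

print(G.nodes['https://example.com/']['ip_address'])  # ('93.184.216.34', 443)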

My code:

from collections import deque
from urllib.parse import urljoin

import networkx as nx
import requests
from bs4 import BeautifulSoup


# Gets the links to traverse as well as the IP Address and returns them.
def get_links(url):
    response = requests.get(url, stream=True)
    ip_address = response.raw._connection.sock.getsockname()
    soup = BeautifulSoup(response.text, 'lxml')
    return [urljoin(url, a['href']) for a in soup.find_all('a', href=True)], ip_address
# Traverses the urls breadth-first up to num_layers, recording each visited url's ip address.
def bfs_traversal(start_url, num_layers, graph):
    visited = []
    visited_ip_addresses = []
    queue = deque([(start_url, 0)])

    attrs = {}
    while queue:
        url, layer = queue.popleft()
        if layer > num_layers:
            # add our attrs
            for url, ip_address in zip(visited, visited_ip_addresses):
                attrs[url] = ip_address

            # Need to set the node attributes manually see below
            nx.set_node_attributes(graph, values=attrs, name='ip_address')

            break

        if url not in visited:
            visited.append(url)
            connections, ip_address = get_links(url)
            visited_ip_addresses.append(ip_address)

            # No longer seems to work hence the above code
            graph.add_node(url, ip_address=ip_address)
            # throws error graph.add_node(url, {'ip_address':ip_address})


            for x in connections:
                graph.add_edge(url, x)
                queue.append((x, layer + 1))

I then create the graph and assign the position:

scan_layers = 1
entrance = "https://www.youtube.com/"

graph = nx.Graph()
seed = 0

bfs_traversal(entrance, scan_layers, graph)
pos = nx.spring_layout(graph, seed=seed)

# Add pos to the node
for n, p in pos.items():
    graph.nodes[n]['pos'] = p

However, since the attributes are not being assigned correctly to every node, my next block of code fails.

ip_addresses = []
for node in graph.nodes(data=True):
    print(node, '\n')
    ip_address = node[1]['ip_address'][0]
    ip_addresses.append(ip_address)

with the following output:

('https://www.youtube.com/about/#content', {'pos': array([-0.48220751, -0.4565694 ])})

Traceback (most recent call last):
  File "C:\Users\Owner\PycharmProjects\WebsiteMapper\gui.py", line 125, in <module>
    gp = GraphPage()
         ^^^^^^^^^^^
  File "C:\Users\Owner\PycharmProjects\WebsiteMapper\gui.py", line 36, in __init__
    fig_json = create_network_graph()
               ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Owner\PycharmProjects\WebsiteMapper\graphs.py", line 89, in create_network_graph
    ip_address = node[1]['ip_address'][0]
                 ~~~~~~~^^^^^^^^^^^^^^
KeyError: 'ip_address'

We can see that an ip address was not assigned for https://www.youtube.com/about/#content. Why is this? What am I doing wrong?

Thanks for any pointers.


Solution

  • Ok, I see where the problem is: at the end of your bfs function, you are indirectly adding additional nodes to the graph:

    for x in connections:
        graph.add_edge(url, x)
        queue.append((x, layer + 1))
    

    You should only do this if you have not yet reached the layer limit, so the function should look like this:

    def bfs_traversal(start_url, num_layers, graph):
        visited = []
        visited_ip_addresses = []
        queue = deque([(start_url, 0)])

        attrs = {}
        while queue:
            url, layer = queue.popleft()
            if layer > num_layers:
                # add our attrs
                for url, ip_address in zip(visited, visited_ip_addresses):
                    attrs[url] = ip_address

                # Need to set the node attributes manually see below
                nx.set_node_attributes(graph, values=attrs, name='ip_address')

                break

            if url not in visited:
                visited.append(url)
                connections, ip_address = get_links(url)
                visited_ip_addresses.append(ip_address)

                # No longer seems to work hence the above code
                print(url)
                graph.add_node(url, ip_address=ip_address)
                # throws error graph.add_node(url, {'ip_address':ip_address})

                # Only follow links while still below the layer limit, so every
                # node added to the graph is also visited and gets an ip address
                if layer < num_layers:
                    for x in connections:
                        graph.add_edge(url, x)
                        queue.append((x, layer + 1))
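
    As an optional safety net (separate from the fix above), you can also read the attribute with a default when collecting the addresses, so any node that somehow lacks an ip_address is skipped instead of raising a KeyError; this uses the data=<key> form of graph.nodes():

    # Skip nodes that never received an ip_address instead of raising KeyError
    for url, ip_address in graph.nodes(data='ip_address', default=None):
        if ip_address is None:
            continue
        print(url, ip_address[0])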