I am trying to map website URLs to their IP addresses and graph the result using networkx. My code finds all links on a webpage using BeautifulSoup and follows those links. I had trouble assigning attributes directly to the nodes, so I used networkx's set_node_attributes
with two lists containing the URLs and the IP addresses. However, one URL consistently fails to get an IP address assigned.
My code:
# Gets the links to traverse as well as the IP Address and returns them.
def get_links(url):
    """Fetch *url* and return its outgoing links and the server's address.

    Returns a ``(links, ip_address)`` pair. ``links`` is a list of absolute
    URLs, one per ``<a href=...>`` found on the page (relative hrefs are
    resolved against *url*). ``ip_address`` is the remote socket address of
    the connection that served the page, so ``ip_address[0]`` is the
    server's IP.
    """
    response = requests.get(url, stream=True)
    # The socket is inspected before response.text is read: with stream=True
    # the connection is still attached to the response at this point.
    # getpeername() is the remote (server) end of the connection;
    # getsockname() would report this machine's own local address instead.
    # NOTE: _connection is a private attribute and may change between
    # library versions.
    ip_address = response.raw._connection.sock.getpeername()
    soup = BeautifulSoup(response.text, 'lxml')
    links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
    return links, ip_address
# Traverses the urls breadth-first and records each page's IP address on the graph.
def bfs_traversal(start_url, num_layers, graph):
    """Breadth-first crawl from start_url, adding pages and links to graph.

    Each visited url is added as a node with an 'ip_address' attribute and
    gets an edge to every link found on that page. The loop stops at the
    first queued url whose layer is greater than num_layers.
    Mutates graph in place and returns None.
    """
    visited = []
    # Parallel to visited: entry i is the address returned for visited[i].
    visited_ip_addresses = []
    # Queue of (url, layer) pairs; the start url is layer 0.
    queue = deque([(start_url, 0)])
    attrs = {}
    while queue:
        url, layer = queue.popleft()
        if layer > num_layers:
            # add our attrs
            for url, ip_address in zip(visited, visited_ip_addresses):
                attrs[url] = ip_address
            # Need to set the node attributes manually see below
            nx.set_node_attributes(graph, values=attrs, name='ip_address')
            break
        if url not in visited:
            visited.append(url)
            connections, ip_address = get_links(url)
            visited_ip_addresses.append(ip_address)
            # No longer seems to work hence the above code
            graph.add_node(url, ip_address=ip_address)
            # throws error graph.add_node(url, {'ip_address':ip_address})
            # NOTE(review): add_edge also creates x as a node when it is not
            # in the graph yet. A node that is only ever reached this way is
            # never appended to visited, so nothing in this function gives
            # it an 'ip_address'.
            for x in connections:
                graph.add_edge(url, x)
                queue.append((x, layer + 1))
I then create the graph and assign the position:
# Crawl settings: the start page and how many link-hops to follow from it.
entrance = "https://www.youtube.com/"
scan_layers = 1
# Passed to spring_layout as its seed.
seed = 0

# Build the graph by crawling outward from the entrance page.
graph = nx.Graph()
bfs_traversal(entrance, scan_layers, graph)

# Compute a layout and store each node's coordinates on the node itself.
pos = nx.spring_layout(graph, seed=seed)
for node, coords in pos.items():
    graph.nodes[node]['pos'] = coords
However, since I am having trouble correctly assigning the attributes to the nodes, my next code fails.
# Collect the first element of every node's 'ip_address' value.
ip_addresses = []
for node in graph.nodes(data=True):
    # With data=True each item is a (node, attribute_dict) pair.
    print(node, '\n')
    # Raises KeyError for any node whose attribute dict has no 'ip_address'.
    ip_address = node[1]['ip_address'][0]
    ip_addresses.append(ip_address)
It fails with the following output:
('https://www.youtube.com/about/#content', {'pos': array([-0.48220751, -0.4565694 ])})
Traceback (most recent call last):
File "C:\Users\Owner\PycharmProjects\WebsiteMapper\gui.py", line 125, in <module>
gp = GraphPage()
^^^^^^^^^^^
File "C:\Users\Owner\PycharmProjects\WebsiteMapper\gui.py", line 36, in __init__
fig_json = create_network_graph()
^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Owner\PycharmProjects\WebsiteMapper\graphs.py", line 89, in create_network_graph
ip_address = node[1]['ip_address'][0]
~~~~~~~^^^^^^^^^^^^^^
KeyError: 'ip_address'
('https://www.youtube.com/about/#content', {'pos': array([-0.48220751, -0.4565694 ])})
Traceback (most recent call last):
File "C:\Users\Owner\PycharmProjects\WebsiteMapper\gui.py", line 125, in <module>
gp = GraphPage()
^^^^^^^^^^^
File "C:\Users\Owner\PycharmProjects\WebsiteMapper\gui.py", line 36, in __init__
fig_json = create_network_graph()
^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Owner\PycharmProjects\WebsiteMapper\graphs.py", line 89, in create_network_graph
ip_address = node[1]['ip_address'][0]
~~~~~~~^^^^^^^^^^^^^^
KeyError: 'ip_address'
We can see that an IP address was not assigned for 'https://www.youtube.com/about/#content'.
Why is this? What am I doing wrong?
Thanks for any pointers.
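OK, I see where the problem is: at the end of your bfs function you are indirectly adding additional nodes to the graph, because add_edge creates any endpoint node that does not exist yet, and those nodes are created without an ip_address: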
for x in connections:
graph.add_edge(url, x)
queue.append((x, layer + 1))
You should do that only if you have not yet reached the layer limit, so it should look like this:
def bfs_traversal(start_url, num_layers, graph):
    """Breadth-first crawl from start_url, at most num_layers links deep.

    Every url within num_layers hops of start_url is fetched with
    get_links() and added to graph as a node carrying an 'ip_address'
    attribute. Edges are only added from pages above the last layer, so
    every node this function adds is also fetched and gets its attribute.
    Mutates graph in place and returns None. A negative num_layers leaves
    graph untouched.
    """
    if num_layers < 0:
        return
    # A set gives O(1) membership tests; order is not needed because the
    # attribute is stored on the node as soon as the page is fetched.
    visited = set()
    # Queue of (url, layer) pairs; the start url is layer 0.
    queue = deque([(start_url, 0)])
    while queue:
        url, layer = queue.popleft()
        if url in visited:
            # The same link can be queued by several pages; fetch it once.
            continue
        visited.add(url)
        connections, ip_address = get_links(url)
        graph.add_node(url, ip_address=ip_address)
        # add_edge creates missing endpoint nodes, so links are only added
        # while the linked page is still within the layer limit and will
        # therefore be fetched (and given its ip_address) later on.
        if layer < num_layers:
            for x in connections:
                graph.add_edge(url, x)
                if x not in visited:
                    queue.append((x, layer + 1))