https://repo1.maven.org/maven2/
This folder contains lots of subdirectories and files. I want to download only maven-metadata.xml
files using Python. I tried an existing answer, but it doesn't traverse subdirectories recursively.
I would recommend using Beautiful Soup as well. You could do something like this; note that my test for whether a link is a directory is very, very simple (it just checks whether the link ends with a '/'):
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
import requests
def isDirectory(url):
    """Return True if *url* looks like a directory link.

    The check is purely textual: a link counts as a directory exactly when
    it ends with a forward slash. No network request is made.
    """
    # str.endswith already yields a bool, so no if/else wrapper is needed.
    return url.endswith('/')
def findLinks(url, visited=None):
    """Recursively walk an HTML directory listing, reporting maven-metadata.xml links.

    Fetches *url*, parses every ``<a href=...>`` on the page, descends into
    links that ``isDirectory`` accepts, and prints "GOTCHA!" for each link
    whose href ends with ``maven-metadata.xml``.

    Args:
        url: Address of the directory listing page to scan.
        visited: Optional set of URLs already scanned. It is shared across
            the recursive calls so no page is fetched twice; callers
            normally leave it as None.

    Returns:
        None. Results are reported via ``print``.

    Exceptions raised by ``requests.get`` (including timeouts) propagate
    to the caller.
    """
    # Local import so this block does not rely on the file's import section.
    from urllib.parse import urljoin

    if visited is None:
        visited = set()
    if url in visited:
        return
    visited.add(url)

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Resolve links against the final URL, so a redirect followed by
    # requests does not produce wrongly-joined child URLs.
    base = response.url
    visited.add(base)

    bsObj = BeautifulSoup(response.content, 'html.parser')
    for link in bsObj.find_all('a', href=True):
        href = link['href']
        print(href)
        print(isDirectory(href))
        # urljoin handles relative, root-relative and absolute hrefs alike.
        newUrl = urljoin(base, href)
        if isDirectory(href):
            # Only descend into real subdirectories of the current page:
            # a parent link such as "../" (or any link leading elsewhere)
            # would otherwise make the recursion climb upward forever.
            if newUrl.startswith(base) and newUrl not in visited:
                findLinks(newUrl, visited)  # recursion happening here
        elif href.endswith('maven-metadata.xml'):
            # newUrl holds the file's full address; save/download it here.
            print("GOTCHA!")
# Root of the directory listing to crawl; the scan starts as soon as this runs.
startUrl = 'https://repo1.maven.org/maven2/'
findLinks(url=startUrl)