I have this final main.py that combines every function I wrote separately, but I can't make it work, it actually returns the Success at the end but it actually does nothing nor in my local folders or MongoDB. The function is this one:
def gw2_etl(url):
def log_scrape(url):
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
response = requests.get(url=url, headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')
data = soup.find_all('script')[8]
dataString = data.text.rstrip()
logData = re.findall(r'{.*}', dataString)
try:
urlLines = url.split('/')
if len(urlLines) < 5:
bossName = urlLines[3]
elif len(urlLines) == 5:
bossName = urlLines[4]
except Exception as e:
return 'Error' + str(e)
tag = bossName.split('_')
bossTag = tag[1]
try:
# Wing_1
if bossTag == 'vg':
pathName = 'ETL\EXTRACT_00\Web Scraping\Boss_data\Wing_1\Valley_Guardian'
with open(f'{pathName}\{bossName}.json', 'w') as f:
for line in logData:
jsonFile = f.write(line)
return jsonFile
return log_scrape()
def store_data(jsonFile):
with open(jsonFile) as f:
data = json.load(f)
sp = jsonFile.split('\\')
posSp = sp[-1]
bossTag = posSp.split('_')
nameTag = bossTag[1]
if len(bossTag) > 2:
nameTag = bossTag[1]
elif len(bossTag) == 2:
tagSplit = nameTag.split('.')
nameTag = tagSplit[0]
# Players Data:
player_group = []
player_acc = []
player_names = []
player_classes = []
for player in data['players']:
player_group.append(player['group'])
player_acc.append(player['acc'])
player_names.append(player['name'])
player_classes.append(player['profession'])
try:
# Wing-1
if nameTag == 'vg':
# Create lists:
player_dps1 = []
player_dps2 = []
player_dps3 = []
# Phase_1
phase1 = data['phases'][1]['dpsStats']
phase1_time_raw = data['phases'][1]['duration']
phase1_time = round(phase1_time_raw/1000,1)
for dps in phase1:
dps1_raw = dps[0]
player_dps1.append(round(dps1_raw/phase1_time,2))
# Phase_2
phase2 = data['phases'][6]['dpsStats']
phase2_time_raw = data['phases'][6]['duration']
phase2_time = round(phase2_time_raw/1000,1)
for dps in phase2:
dps2_raw = dps[0]
player_dps2.append(round(dps2_raw/phase2_time,2))
# Phase_3
phase3 = data['phases'][12]['dpsStats']
phase3_time_raw = data['phases'][12]['duration']
phase3_time = round(phase3_time_raw/1000,1)
for dps in phase3:
dps3_raw = dps[0]
player_dps3.append(round(dps3_raw/phase3_time,2))
stats_dict = {
'players':{
'group': player_group,
'account': player_acc,
'names': player_names,
'profession': player_classes,
'phase_1_dps': player_dps1,
'phase_2_dps': player_dps2,
'phase_3_dps': player_dps3
}
}
df = pd.DataFrame(stats_dict['players'], columns=['group','account','names','profession','phase_1_dps','phase_2_dps','phase_3_dps'])
return stats_dict
except Exception as e:
print('Error' + str(e))
sys.exit()
# JSON generator (MongoDB)
pathName = 'ETL\TRANSFORM_01\Players_info'
jsonString = json.dumps(stats_dict)
with open(f"{pathName}\{nameTag}_player_stats.json", 'w') as f:
f.write(jsonString)
# CSV generator (MySQL, PostgreSQL)
df.to_csv(f"{pathName}\{nameTag}_player_stats.csv",index=True)
return store_data()
def mongo_connect(stats_dict):
try:
client = pymongo.MongoClient('mongodb://localhost:27017/')
except Exception as e:
print('Connection could not be done' + str(e))
sys.exit()
db = client['GW2_SRS']
collection = db['players_info']
mongo_insert = collection.insert_one(stats_dict)
return mongo_connect()
return 'Success!'
pass
My goal is that, when I call gw2_etl(), it runs every process inside (log_scrape, store_data and mongo_connect) and returns the Success message at the end. I'm probably doing it wrong since it neither runs anything nor send an error message.
For the mongo connection, I need to return the stats_dict, since it is the JSON file that I want to upload there, csv file is just for local storage.
I actually got some bosses out since the code it's actually pretty long.
If you have any hint or clue about how could I make this work, I would be incredibly grateful.
You still need to call all of those functions separately from within the gw2_etl()
before returning from the function. Defining functions inside another just means you can't access them outside of the outer function. So before the return statement add
log_scraper(url)
store_data(json_file)
mongo_connect(stats_dict)
and continue from there. You'll notice that you need to carry over some variables to invoke the functions with the correct arguments, but I left that part for you to figure out.