I want to convert complex file output into a simpler version, but I can't seem to get the regex right.
I have tried using regex and pandas to convert this weird formatted code to something nicer but the code I'm using just gives headers and not data. The data looks like this (be warned it's horrible), the sample file is here: https://wetransfer.com/downloads/d5c0588d5dd08d0e67ddf854d4a3c3bb20240906142948/0d9147.
Weather Summary for summer 2024
Rainfall | Air Temperature | Sunshine | Wind | Grass Temp | ||||||
---|---|---|---|---|---|---|---|---|---|---|
Most in Day | Means of | Extreme Temperature | Most in Day | Lowest | ||||||
Total | ______ | _____ | ______ | _____ | _____ | _____ | Total | _____ | Grass date | |
Station | Amount | Date | Max. | Min. | Max | Min | Date | min | ||
___________________ | _______ | ______ | _____ | ______ | _____ | _____ | _____ | ________ | _____ | ______________ |
station 1 | 121.5 | 13.4 | 29 Ju | 19.7 | 10.2 | 26.6 | 4.5 | -- | -- | -0.7 19 Ju |
___________________ | _______ | ______ | _____ | ______ | _____ | _____ | _____ | ________ | _____ | ______________ |
station 2 | 235.9 | 21.1 | 26 Ag | 16.6 | 11.9 | 21.9 | 6.5 | -- | -- | -1.3 11 Ju |
___________________ | _______ | ______ | _____ | ______ | _____ | _____ | _____ | ________ | _____ | ______________ |
station 3 | 135.7 | 13.6 | 29 Ju | 19.3 | 10.1 | 24.0 | 3.5 | -- | -- | -0.7 12 Ju |
___________________ | _______ | ______ | _____ | ______ | _____ | _____ | _____ | ________ | _____ | ______________ |
I want to get it in simple csv format but I can't figure it out.
Screenshot of what it looks like is here:
My latest code:
import re
import csv
#Input and output file paths
input_file = 'table_example.txt
output_file = 'weather_summary.csv
# Define a function to parse the file and extract the table data
def parse_weather_data(file_path):
with open(file_path, 'r') as file:
lines = file.readlines()
# List to hold the processed rows
data = []
# Regex to match valid data rows (ignoring separators and headers)
pattern = re.compile(r'\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|')
for line in lines:
match = pattern.match(line)
if match:
# Extract matched groups and clean the values
row = [group.strip() for group in match.groups()]
data.append(row)
return data
# Define a function to write data to CSV
def write_to_csv(data, output_file):
# Column headers
headers = [
"Station", "Rainfall Total", "Most in Day (Amount)", "Most
in Day (Date)",
"Max Air Temp", "Min Air Temp", "Mean Air Temp", "Max
Temp", "Max Temp Date",
"Min Temp", "Min Temp Date", "Sunshine Total", "Most in Day
Sunshine (Amount)",
"Most in Day Sunshine (Date)", "Max Wind Gust", "Max Wind
Gust Date",
"Grass Min Temp", "Grass Min Temp Date"
]
with open(output_file, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerow(headers) # Write the headers
writer.writerows(data) # Write the data rows`
# Parse the weather data from the input file
weather_data = parse_weather_data(input_file)
# Write the parsed data to CSV
write_to_csv(weather_data, output_file)
EDIT Updated code which gives the dataframe but it's still messy:
data = pd.read_csv('example_table.txt', sep="|", header=None, skiprows = 8)
data.columns = [
"Station", "Rainfall Total", "Most in Day (Amount)", "Most in Day (Date)",
"Max Air Temp", "Min Air Temp", "Mean Air Temp", "Max Temp", "Max Temp Date",
"Min Temp", "Min Temp Date", "Sunshine Total", "Most in Day Sunshine (Amount)",
"Most in Day Sunshine (Date)", "Max Wind Gust", "Max Wind Gust Date",
"Grass Min Temp", "Grass Min Temp Date"
]
data
As your data seem appear to be fixed width and you tagged your question with pandas
, you could use read_fwf
:
import pandas as pd
infile = 'table_example.txt'
outfile = 'weather_summary.csv'
colspecs = [
(2, 21), (22, 29), (30, 36), (37, 42), (43, 49), (50, 55), (56, 63),
(64, 69), (70, 75), (76, 81), (82, 87), (88, 96), (97, 103),
(104, 109), (110, 115), (116, 123), (124, 129), (130, 138)
]
names = [
"Station", "Rainfall Total", "Most in Day (Amount)", "Most in Day (Date)",
"Max Air Temp", "Min Air Temp", "Mean Air Temp", "Max Temp", "Max Temp Date",
"Min Temp", "Min Temp Date", "Sunshine Total", "Most in Day Sunshine (Amount)",
"Most in Day Sunshine (Date)", "Max Wind Gust", "Max Wind Gust Date",
"Grass Min Temp", "Grass Min Temp Date"
]
df = pd.read_fwf(infile, skiprows=8, colspecs=colspecs, names=names)
df = df[~df.Station.fillna('').str.startswith('_')]
df.to_csv(outfile, index=False)