Search code examples
pythonjsonstringsplitpython-re

How can a number range and value be extracted from this complicated string using Python?


I have a complicated string that includes a kilometer range and a fee for users that fall into that range. Ideally, I would like to transform the string into something that I could use to easily assign fees to users. I've taken the steps below to extract a list from the string, but the kilometer range is still not usable for classifying users:

import string
import unicodedata
import re
str = '{"row1":{"from":"0","to":"500","fee":"23100    "},"row2":{"from":"500","to":"1000","fee":"24100    "},"row3":{"from":"1000","to":"1500","fee":"25200    "},"row4":{"from":"1500","to":"2000","fee":"26200    "},"row5":{"from":"2000","to":"2500","fee":"27200    "},"row6":{"from":"2500","to":"3000","fee":"28300    "},"row7":{"from":"3000","to":"3500","fee":"29300    "},"row8":{"from":"3500","to":"4000","fee":"30400    "},"row9":{"from":"4000","to":"4500","fee":"31400    "},"row10":{"from":"4500","to":"5000","fee":"32400    "},"row11":{"from":"5000","to":"5500","fee":"33500    "},"row12":{"from":"5500","to":"6000","fee":"34600    "},"row13":{"from":"6000","to":"6500","fee":"35500  "},"row14":{"from":"6500","to":"7000","fee":"36600    "},"row15":{"from":"7000","to":"7500","fee":"37700    "},"row16":{"from":"7500","to":"8000","fee":"38600    "},"row17":{"from":"8000","to":"8500","fee":"39700    "},"row18":{"from":"8500","to":"9000","fee":"40300    "},"row19":{"from":"9000","to":"9500","fee":"41400    "},"row20":{"from":"9500","to":"10000","fee":"42700    "},"row21":{"from":"10000","to":"10500","fee":"43500    "},"row22":{"from":"10500","to":"11000","fee":"44500    "},"row23":{"from":"11000","to":"11500","fee":"45600    "},"row24":{"from":"11500","to":"12000","fee":"46600    "},"row25":{"from":"12000","to":"12500","fee":"47700    "},"row26":{"from":"12500","to":"13000","fee":"48700    "},"row27":{"from":"13000","to":"13500","fee":"49700    "},"row28":{"from":"13500","to":"14000","fee":"50800    "},"row29":{"from":"14000","to":"14500","fee":"51900    "},"row30":{"from":"14500","to":"15000","fee":"52800    "},"row31":{"from":"15000","to":"15500","fee":"52800    "},"row32":{"from":"15500","to":"16000","fee":"52800    "},"row33":{"from":"16000","to":"70000","fee":"52800    "'
str1 = unicodedata.normalize("NFKD", str)
str2 = str1.translate({ord(c): None for c in string.whitespace})
s1 = str2.replace('{', "")
s2 = s1.replace('"', ' ')
s3 = s2.split(sep='},')

for i in range(len(s3)):
    ftm = re.search(r'from : (\d+) , to : (\d+)', s3[i])
    fm = re.search(r'fee : (\d+)', s3[i])
    if ftm and fm:
        vfrom = int(ftm.group(1))
        vto = int(ftm.group(2))
        vfee= int(fm.group(1))
        print(f"({vfrom}, {vto}), {vfee}")

I would like it to be instead transformed into something that extracts the lower and upper bounds of the range separately next to the fee, where I could later export into Excel and check which range the user falls between.


Solution

  • Your input data is almost valid JSON except that it's missing 2 trailing right-braces. If you fix that in code (albeit that it should be fixed at origin) then you can simply do this:

    import json
    
    _str = '{"row1":{"from":"0","to":"500","fee":"23100    "},"row2":{"from":"500","to":"1000","fee":"24100    "},"row3":{"from":"1000","to":"1500","fee":"25200    "},"row4":{"from":"1500","to":"2000","fee":"26200    "},"row5":{"from":"2000","to":"2500","fee":"27200    "},"row6":{"from":"2500","to":"3000","fee":"28300    "},"row7":{"from":"3000","to":"3500","fee":"29300    "},"row8":{"from":"3500","to":"4000","fee":"30400    "},"row9":{"from":"4000","to":"4500","fee":"31400    "},"row10":{"from":"4500","to":"5000","fee":"32400    "},"row11":{"from":"5000","to":"5500","fee":"33500    "},"row12":{"from":"5500","to":"6000","fee":"34600    "},"row13":{"from":"6000","to":"6500","fee":"35500  "},"row14":{"from":"6500","to":"7000","fee":"36600    "},"row15":{"from":"7000","to":"7500","fee":"37700    "},"row16":{"from":"7500","to":"8000","fee":"38600    "},"row17":{"from":"8000","to":"8500","fee":"39700    "},"row18":{"from":"8500","to":"9000","fee":"40300    "},"row19":{"from":"9000","to":"9500","fee":"41400    "},"row20":{"from":"9500","to":"10000","fee":"42700    "},"row21":{"from":"10000","to":"10500","fee":"43500    "},"row22":{"from":"10500","to":"11000","fee":"44500    "},"row23":{"from":"11000","to":"11500","fee":"45600    "},"row24":{"from":"11500","to":"12000","fee":"46600    "},"row25":{"from":"12000","to":"12500","fee":"47700    "},"row26":{"from":"12500","to":"13000","fee":"48700    "},"row27":{"from":"13000","to":"13500","fee":"49700    "},"row28":{"from":"13500","to":"14000","fee":"50800    "},"row29":{"from":"14000","to":"14500","fee":"51900    "},"row30":{"from":"14500","to":"15000","fee":"52800    "},"row31":{"from":"15000","to":"15500","fee":"52800    "},"row32":{"from":"15500","to":"16000","fee":"52800    "},"row33":{"from":"16000","to":"70000","fee":"52800    "'
    
    for v in json.loads(_str + "}}").values():
        row = [v[k].replace(" ", "").strip() for k in ("from", "to", "fee")]
        print(", ".join(row))
    

    Output (partial):

    0, 500, 23100
    500, 1000, 24100
    1000, 1500, 25200
    1500, 2000, 26200
    2000, 2500, 27200
    2500, 3000, 28300
    3000, 3500, 29300
    3500, 4000, 30400
    4000, 4500, 31400
    4500, 5000, 32400
    5000, 5500, 33500
    5500, 6000, 34600
    6000, 6500, 35500
    6500, 7000, 36600