Search code examples
python html web-scraping python-requests pre

Parsing a pre tag in html, how to append the indented text to the previous line in Python


Example URL https://bioconductor.org/packages/release/bioc/VIEWS

Currently I'm splitting the text into individual clumps of metadata at every blank line, then converting each clump to a dictionary by splitting on the first colon, using the string before it as the key and the string after it as the value. The issue I'm running into is that, as I go line by line through each package's metadata, some lines do not contain a colon, and I want to append those lines to the previous value so that it forms one complete string.

    # Download the Bioconductor VIEWS file.
    response = requests.get(
    'https://bioconductor.org/packages/release/bioc/VIEWS')

    # Split the response body on blank lines, giving one chunk of text per
    # blank-line-separated record.
    package_list = response.text.split('\n\n')

    # NOTE(review): the comprehension iterates `package` but calls .split on
    # `package_list`, which is a list and has no .split method, so this
    # raises AttributeError as written; `package.split(':')` was presumably
    # intended - confirm.
    # NOTE(review): split(':') splits on every colon, not only the first, so
    # index [1] is just the text between the first and second colon.
    package_dict = {
        package_list.split(':')[0]: package_list.split(':')[1] for package in package_list
    }

Solution

  • Try using regex to parse the data:

    import re
    import requests
    
    url = "https://bioconductor.org/packages/release/bioc/VIEWS"

    # Fail fast instead of hanging forever on a stalled connection, and refuse
    # to parse an HTTP error page as if it were package metadata.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.text

    # One match per "Key: value" field of a record:
    #   group 1 - the key: starts at the beginning of a line with a
    #             non-whitespace character and runs up to the first colon;
    #   group 2 - the value: matched lazily (re.S lets it span newlines, so
    #             indented continuation lines stay part of it) until the
    #             lookahead finds the next line that starts with a
    #             non-whitespace character and is followed by a colon, or the
    #             end of the chunk. Trailing whitespace is left out by \s*.
    pat = re.compile(
        r"^([^\s][^:]*): (.+?)\s*(?=^[^\s][^:]*:|\Z)", flags=re.S | re.M
    )

    # Records are separated by blank lines; empty chunks are skipped.
    out = [dict(pat.findall(chunk)) for chunk in data.split("\n\n") if chunk]

    print(out)
    

    Prints:

    [
        {
            "Package": "a4",
            "Version": "1.44.0",
            "Depends": "a4Base, a4Preproc, a4Classif, a4Core, a4Reporting",
            "Suggests": "MLP, nlcv, ALL, Cairo, Rgraphviz, GOstats",
            "License": "GPL-3",
            "MD5sum": "cc696d3373a9f258d293f2d966da11d5",
            "NeedsCompilation": "no",
            "Title": "Automated Affymetrix Array Analysis Umbrella Package",
            "Description": "Umbrella package is available for the entire Automated\n        Affymetrix Array Analysis suite of package.",
            "biocViews": "Microarray",
            "Author": "Willem Talloen [aut], Tobias Verbeke [aut], Laure Cougnaud\n        [cre]",
            "Maintainer": "Laure Cougnaud <[email protected]>",
            "git_url": "https://git.bioconductor.org/packages/a4",
            "git_branch": "RELEASE_3_15",
            "git_last_commit": "5b0fc5a",
            "git_last_commit_date": "2022-04-26",
            "Date/Publication": "2022-04-26",
            "source.ver": "src/contrib/a4_1.44.0.tar.gz",
            "win.binary.ver": "bin/windows/contrib/4.2/a4_1.44.0.zip",
            "mac.binary.ver": "bin/macosx/contrib/4.2/a4_1.44.0.tgz",
            "vignettes": "vignettes/a4/inst/doc/a4vignette.pdf",
            "vignetteTitles": "a4vignette",
            "hasREADME": "FALSE",
            "hasNEWS": "TRUE",
            "hasINSTALL": "FALSE",
            "hasLICENSE": "FALSE",
            "Rfiles": "vignettes/a4/inst/doc/a4vignette.R",
            "dependencyCount": "82"
        },
        {
            "Package": "a4Base",
            "Version": "1.44.0",
            "Depends": "a4Preproc, a4Core",
            "Imports": "methods, graphics, grid, Biobase, annaffy, mpm, genefilter,\n        limma, multtest, glmnet, gplots",
            "Suggests": "Cairo, ALL, hgu95av2.db, nlcv",
            "Enhances": "gridSVG, JavaGD",
            "License": "GPL-3",
            "MD5sum": "094c0a1c87b18ff8f16a3dbe4d06da64",
            "NeedsCompilation": "no",
            "Title": "Automated Affymetrix Array Analysis Base Package",
            "Description": "Base utility functions are available for the Automated\n        Affymetrix Array Analysis set of packages.",
            "biocViews": "Microarray",
            "Author": "Willem Talloen [aut], Tine Casneuf [aut], An De Bondt [aut],\n        Steven Osselaer [aut], Hinrich Goehlmann [aut], Willem\n        Ligtenberg [aut], Tobias Verbeke [aut], Laure Cougnaud [cre]",
            "Maintainer": "Laure Cougnaud <[email protected]>",
            "git_url": "https://git.bioconductor.org/packages/a4Base",
            "git_branch": "RELEASE_3_15",
            "git_last_commit": "9ae69e0",
            "git_last_commit_date": "2022-04-26",
            "Date/Publication": "2022-04-26",
            "source.ver": "src/contrib/a4Base_1.44.0.tar.gz",
            "win.binary.ver": "bin/windows/contrib/4.2/a4Base_1.44.0.zip",
            "mac.binary.ver": "bin/macosx/contrib/4.2/a4Base_1.44.0.tgz",
            "hasREADME": "FALSE",
            "hasNEWS": "TRUE",
            "hasINSTALL": "FALSE",
            "hasLICENSE": "FALSE",
            "dependsOnMe": "a4",
            "suggestsMe": "epimutacions",
            "dependencyCount": "73"
        },
     
    ...and so on.