Search code examples
python-3.x beautifulsoup html-parsing

Beautiful Soup 4: find all input tags inside a particular div id


I am trying to extract the value of the highlighted input tag. Currently, I can get the div with id="WopiDocWACContainer" (style="overflow:hidden;") by running the code below:

import requests
from bs4 import BeautifulSoup
# Download the shared-document page and parse the HTML the server returned.
response = requests.get('https://editproj.sharepoint.com/:x:/g/Ea32XJl_g9VBreFAia_zMmEBY6FW2ZWh8F4VeJ1Rt5Z4YA?e=rpUKYv')
document = BeautifulSoup(response.text, 'html.parser')

# Collect every <div> whose id is "WopiDocWACContainer" and show the matches.
fromtag = document.find_all("div", attrs={"id": "WopiDocWACContainer"})
print(fromtag)

enter image description here

I tried the code below to narrow the result down and get all the input tags, but I am unable to get them:

# Attempt to narrow the matched <div> results down to their hidden <input> tags
# and print each one's text.
for tag in fromtag:
    # NOTE(review): .find() is called on `fromtag` (the whole find_all() result),
    # not on the loop variable `tag` -- presumably why this attempt fails; confirm.
    inputtag = fromtag.find("input",{"type" : "hidden"})
    # The inner loop reuses the name `tag`, shadowing the outer loop variable.
    for tag in inputtag:
        print (tag.text)

How can I extract the value of the highlighted input in the picture above?


Solution

  • The page is loaded with JavaScript, and the requests module cannot render JavaScript.

    In your case, the desired output is actually present inside an HTML script tag, so I've quickly parsed it with a regex.

    import requests
    import re
    
    # Fetch the sharing page. The wanted value sits inside an inline <script>
    # tag, so the raw response text is searched rather than parsed as HTML.
    r = requests.get(
        "https://editproj.sharepoint.com/:x:/g/Ea32XJl_g9VBreFAia_zMmEBY6FW2ZWh8F4VeJ1Rt5Z4YA?rtime=CJOEsjTZ10g",
        timeout=30,  # never hang forever on an unresponsive server
    )
    # Fail loudly on an HTTP error instead of searching an error page.
    r.raise_for_status()
    
    # Non-greedy capture of the string value that follows the FileGetUrl key.
    match = re.search(r'FileGetUrl":"(.*?)"', r.text)
    if match is None:
        # Without this check a missing key surfaces as an AttributeError on None.
        raise SystemExit("FileGetUrl was not found in the page source")
    
    # The captured text is still JSON-escaped (for example "&" shows up as
    # \u0026); pass it through a JSON parser if a directly usable URL is needed.
    print(match.group(1))
    

    Output:

    https://editproj.sharepoint.com/_layouts/15/download.aspx?UniqueId=995cf6ad-837f-41d5-ade1-4089aff33261\u0026Translate=false\u0026tempauth=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.eyJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvZWRpdHByb2ouc2hhcmVwb2ludC5jb21AZDJjZTI4MGQtYWExMi00ODQxLWFjYjMtOWYxZDNlMDYzYjhkIiwiaXNzIjoiMDAwMDAwMDMtMDAwMC0wZmYxLWNlMDAtMDAwMDAwMDAwMDAwIiwibmJmIjoiMTU4NjA3MzQwNCIsImV4cCI6IjE1ODYxMDk0MDQiLCJlbmRwb2ludHVybCI6Imtsc0lNb1NtVDQyejBXY085ZGQ2bHovUUJ3ZUVuZzZRd0MxcmdkTGxsVEU9IiwiZW5kcG9pbnR1cmxMZW5ndGgiOiIxMTkiLCJpc2xvb3BiYWNrIjoiVHJ1ZSIsImNpZCI6Ik5EVmlPRFExT1dZdE9UQmhNaTFoTURBd0xXTmlOemN0TTJSaE4yVTBZMlF6WWpWaiIsInZlciI6Imhhc2hlZHByb29mdG9rZW4iLCJzaXRlaWQiOiJNVFEwWmpsbU0yRXRNakV5TnkwME16RmhMV0ppTmpBdE1EY3dNbUV5TXpnNVpqQTMiLCJuYW1laWQiOiIwIy5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2IiwibmlpIjoibWljcm9zb2Z0LnNoYXJlcG9pbnQiLCJpc3VzZXIiOiJ0cnVlIiwiY2FjaGVrZXkiOiIwaC5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2Iiwic2hhcmluZ2lkIjoiNGtZU0VmaDFaMGlObWM3NnV1bkl6dyIsInR0IjoiMCIsInVzZVBlcnNpc3RlbnRDb29raWUiOiIyIn0.R3NjZWhxKzZobmI2bVhSK1JvZzNqUFl0QUw4SDhiTHlETkdQUWQ5MFZjVT0
    

    To load the full JSON object:

    import requests
    import re
    import json
    
    # Fetch the sharing page; the whole context object is assigned to a
    # JavaScript variable inside the returned page source.
    r = requests.get(
        "https://editproj.sharepoint.com/:x:/g/Ea32XJl_g9VBreFAia_zMmEBY6FW2ZWh8F4VeJ1Rt5Z4YA?rtime=CJOEsjTZ10g",
        timeout=30,  # never hang forever on an unresponsive server
    )
    # Fail loudly on an HTTP error instead of searching an error page.
    r.raise_for_status()
    
    # Find where the object literal assigned to _wopiContextJson begins.
    marker = re.search(r"var _wopiContextJson =(?=\{)", r.text)
    if marker is None:
        # Without this check a missing variable surfaces as an AttributeError.
        raise SystemExit("_wopiContextJson was not found in the page source")
    
    # raw_decode parses exactly one JSON value and ignores whatever follows it,
    # so trailing script code on the same line cannot corrupt the result the
    # way a greedy "{.+}" capture could.
    data, _ = json.JSONDecoder().raw_decode(r.text[marker.end():])
    
    print(data.keys())
    
    # print(json.dumps(data, indent=4)) # to see it in human readable format.
    

    Output:

    dict_keys(['HostName', 'SessionId', 'UserId', 'WebAppUrl', 'FileName', 'FileSize', 'FileGetUrl', 'BundleMajorVersion', 'BundleUrl', 'ReadOnly', 'IrmEnabled', 'LabelIrmed', 'LastModified', 'ServerStartTime', 'ServerCompleteTime', 'DocUniqueId', 'CTag', 'ETag', 'RumOneUpdate', 'OpenWacInPlace', 'TemplateInfo', 'BundleStaleness', 'IsAsyncBundleStale', 'IsActivatedAsyncPreviewKillSwich', 'ViewOnly', 'DelayLoadResources', 'DocAspxSingleFlush', 'Origin', 'Slrid', 'InteractiveReadonlyExperiment', 'ClickTime', 'UniqueClick', 'HostGeo', 'PredictedOfficeAppEndPoint', 'PreseededSessionKey', 'PreseededWacSessionId', 'ParentFolderFullUrl', 'DocAgeBucketAtViewTimeBasedOnLastModifiedTime', 'DocAgeBucketAtViewTimeBasedOnLastWrittenTime', 'DocCategoryBasedOnLastModifiedTime', 'DocCategoryBasedOnLastWrittenTime', 'SSRGenerationReason', 'RecordAgeBucketsAndCategoryForRumOneUsingWopicontext', 'ListItemId', 'ListId', 'AllowedOrigins', 'IsPragueDocument', 'PragueSocketStorageDiscovery', 'ResetUriToAddressBarLink', 'IsEduUser']) 
    
    {
        "HostName": "SharePoint Online",
        "SessionId": "ABB8459F-300E-A000-C5E7-EB06D6377846",        
        "UserId": "",
        "WebAppUrl": "https://excel.officeapps.live.com/x/_layouts/xlviewerinternal.aspx?unified=1&ui=en%2DUS&rs=en%2DUS&WOPISrc=https%3A%2F%2Feditproj%2Esharepoint%2Ecom%2F%5Fvti%5Fbin%2Fwopi%2Eashx%2Ffiles%2F995cf6ad837f41d5ade14089aff33261&wdEnableRoaming=1&mscc=0&hid=abb8459f-300e-a000-c5e7-eb06d6377846",
        "FileName": "coronavirus-school-closures-data.xlsx",        
        "FileSize": 22439,
        "FileGetUrl": "https://editproj.sharepoint.com/_layouts/15/download.aspx?UniqueId=995cf6ad-837f-41d5-ade1-4089aff33261&Translate=false&tempauth=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.eyJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvZWRpdHByb2ouc2hhcmVwb2ludC5jb21AZDJjZTI4MGQtYWExMi00ODQxLWFjYjMtOWYxZDNlMDYzYjhkIiwiaXNzIjoiMDAwMDAwMDMtMDAwMC0wZmYxLWNlMDAtMDAwMDAwMDAwMDAwIiwibmJmIjoiMTU4NjA3MzgxOSIsImV4cCI6IjE1ODYxMDk4MTkiLCJlbmRwb2ludHVybCI6Imtsc0lNb1NtVDQyejBXY085ZGQ2bHovUUJ3ZUVuZzZRd0MxcmdkTGxsVEU9IiwiZW5kcG9pbnR1cmxMZW5ndGgiOiIxMTkiLCJpc2xvb3BiYWNrIjoiVHJ1ZSIsImNpZCI6IllXSmlPRFExT1dZdE16QXdaUzFoTURBd0xXTTFaVGN0WldJd05tUTJNemMzT0RRMiIsInZlciI6Imhhc2hlZHByb29mdG9rZW4iLCJzaXRlaWQiOiJNVFEwWmpsbU0yRXRNakV5TnkwME16RmhMV0ppTmpBdE1EY3dNbUV5TXpnNVpqQTMiLCJuYW1laWQiOiIwIy5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2IiwibmlpIjoibWljcm9zb2Z0LnNoYXJlcG9pbnQiLCJpc3VzZXIiOiJ0cnVlIiwiY2FjaGVrZXkiOiIwaC5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2Iiwic2hhcmluZ2lkIjoiS0dISHljQ05Ca0s2Mlk4Z24zLzNSQSIsInR0IjoiMCIsInVzZVBlcnNpc3RlbnRDb29raWUiOiIyIn0.R2k1TU9kWFpKajJ5WHdiOFhtTWNFNmV2cERaTnNMQXlMM1plaDZIc1drOD0",
        "BundleMajorVersion": 1,
        "BundleUrl": "https://editproj.sharepoint.com/_api/v2.0/drives/b!Op9PFCchGkO7YAcCojifB1FMdGKT1DNGuc0VZ45Ny28Z17W52UlFQ5ThVO31JqPP/items/01RTHF455N6ZOJS74D2VA23YKARGX7GMTB/versions/Published/streams/content_preview_O{0}/streamContent?tempauth=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.eyJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvZWRpdHByb2ouc2hhcmVwb2ludC5jb21AZDJjZTI4MGQtYWExMi00ODQxLWFjYjMtOWYxZDNlMDYzYjhkIiwiaXNzIjoiMDAwMDAwMDMtMDAwMC0wZmYxLWNlMDAtMDAwMDAwMDAwMDAwIiwibmJmIjoiMTU4NjA3MzgxOSIsImV4cCI6IjE1ODYwOTU0MTkiLCJlbmRwb2ludHVybCI6IjdxVmtxdVZ6bVlaN1MwQXQyUWR4dytLWktRclpHazlnVkxhVW43TGkyaUE9IiwiZW5kcG9pbnR1cmxMZW5ndGgiOiIxNTYiLCJpc2xvb3BiYWNrIjoiVHJ1ZSIsImNpZCI6IllXSmlPRFExT1dZdE16QXdaUzFoTURBd0xXTTFaVGN0WldJd05tUTJNemMzT0RRMiIsInZlciI6Imhhc2hlZHByb29mdG9rZW4iLCJzaXRlaWQiOiJNVFEwWmpsbU0yRXRNakV5TnkwME16RmhMV0ppTmpBdE1EY3dNbUV5TXpnNVpqQTMiLCJuYW1laWQiOiIwIy5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2IiwibmlpIjoibWljcm9zb2Z0LnNoYXJlcG9pbnQiLCJpc3VzZXIiOiJ0cnVlIiwiY2FjaGVrZXkiOiIwaC5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2Iiwic2hhcmluZ2lkIjoiS0dISHljQ05Ca0s2Mlk4Z24zLzNSQSIsInR0IjoiMCIsInVzZVBlcnNpc3RlbnRDb29raWUiOiIyIn0.Z3BlU3hhWmNSRFI0YitvNkErWFkxU01uZVptZVVpZEgwUHBwNDBuVWhzMD0&usecachedssr=1&prefetchSSRCorrelationId=abb8459f-300e-a000-c5e7-eb06d6377846",
        "ReadOnly": true,
        "IrmEnabled": false,
        "LabelIrmed": false,
        "LastModified": 1585952003000,
        "ServerStartTime": 1586073819363.505,
        "ServerCompleteTime": 0,
        "DocUniqueId": "editproj.sharepoint.com_144f9f3a-2127-431a-bb60-0702a2389f07_995cf6ad-837f-41d5-ade1-4089aff33261",
        "CTag": "{995CF6AD-837F-41D5-ADE1-4089AFF33261},53,73",     
        "ETag": "\"{995CF6AD-837F-41D5-ADE1-4089AFF33261},53\"",    
        "RumOneUpdate": true,
        "OpenWacInPlace": false,
        "TemplateInfo": null,
        "BundleStaleness": 0,
        "IsAsyncBundleStale": false,
        "IsActivatedAsyncPreviewKillSwich": false,
        "ViewOnly": false,
        "DelayLoadResources": true,
        "DocAspxSingleFlush": true,
        "Origin": "Sharing",
        "Slrid": "aab8459f-d0f2-a000-c5e7-e3229e5f2252",
        "InteractiveReadonlyExperiment": "Control",
        "ClickTime": 0,
        "UniqueClick": "504924ca-7e0e-49d9-a411-0779c638ce68",      
        "HostGeo": "US",
        "PredictedOfficeAppEndPoint": "",
        "PreseededSessionKey": null,
        "PreseededWacSessionId": null,
        "ParentFolderFullUrl": "https://editproj.sharepoint.com/Shared Documents/cartomapcoronavirus",
        "DocAgeBucketAtViewTimeBasedOnLastModifiedTime": "1-2 days",
        "DocAgeBucketAtViewTimeBasedOnLastWrittenTime": "1-2 days", 
        "DocCategoryBasedOnLastModifiedTime": "Valid",
        "DocCategoryBasedOnLastWrittenTime": "Valid",
        "SSRGenerationReason": "AnalysisPipeline",
        "RecordAgeBucketsAndCategoryForRumOneUsingWopicontext": true,
        "ListItemId": 37,
        "ListId": "b9b5d719-49d9-4345-94e1-54edf526a3cf",
        "AllowedOrigins": [
            "https://excel.officeapps.live.com"
        ],
        "IsPragueDocument": false,
        "PragueSocketStorageDiscovery": null,
        "ResetUriToAddressBarLink": false,
        "IsEduUser": false
    }