Search code examples
pythonpandasgoogle-docs-api

Extract information from Google Doc into Pandas DF


I'm looking for a way to extract information from my private Google Doc and put them into DF to perform some simple analysis.

I'm connected to the Google Doc API and able to read the document but I'm struggling with extracting only relevant information and in correct order.

This is how my doc looks like: doc

And this is the output I need to have:

table

I would be able to create separate regexes for single items but how to group it into table like above, given that number of paragraphs for single date can vary (same for tags)?

Thanks

This is what I received from API:

{'title': 'test',
 'body': {'content': [{'endIndex': 1,
    'sectionBreak': {'sectionStyle': {'columnSeparatorStyle': 'NONE',
      'contentDirection': 'LEFT_TO_RIGHT',
      'sectionType': 'CONTINUOUS'}}},
   {'startIndex': 1,
    'endIndex': 11,
    'paragraph': {'elements': [{'startIndex': 1,
       'endIndex': 11,
       'textRun': {'content': 'Doc Title\n',
        'textStyle': {'bold': True,
         'fontSize': {'magnitude': 13, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'alignment': 'CENTER',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 11,
    'endIndex': 12,
    'paragraph': {'elements': [{'startIndex': 11,
       'endIndex': 12,
       'textRun': {'content': '\n',
        'textStyle': {'bold': True,
         'fontSize': {'magnitude': 13, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'alignment': 'CENTER',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 12,
    'endIndex': 13,
    'paragraph': {'elements': [{'startIndex': 12,
       'endIndex': 13,
       'textRun': {'content': '\n',
        'textStyle': {'bold': True,
         'fontSize': {'magnitude': 13, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'alignment': 'CENTER',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 13,
    'endIndex': 23,
    'paragraph': {'elements': [{'startIndex': 13,
       'endIndex': 23,
       'textRun': {'content': '7.03.2023\n',
        'textStyle': {'bold': True,
         'fontSize': {'magnitude': 13, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'alignment': 'CENTER',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 23,
    'endIndex': 24,
    'paragraph': {'elements': [{'startIndex': 23,
       'endIndex': 24,
       'textRun': {'content': '\n',
        'textStyle': {'bold': True,
         'fontSize': {'magnitude': 13, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'alignment': 'CENTER',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 24,
    'endIndex': 27,
    'paragraph': {'elements': [{'startIndex': 24,
       'endIndex': 27,
       'textRun': {'content': '1.\n',
        'textStyle': {'bold': True,
         'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 27,
    'endIndex': 41,
    'paragraph': {'elements': [{'startIndex': 27,
       'endIndex': 41,
       'textRun': {'content': 'Example text…\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 41,
    'endIndex': 42,
    'paragraph': {'elements': [{'startIndex': 41,
       'endIndex': 42,
       'textRun': {'content': '\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 42,
    'endIndex': 55,
    'paragraph': {'elements': [{'startIndex': 42,
       'endIndex': 55,
       'textRun': {'content': '#tag1, #tag2\n',
        'textStyle': {'underline': True,
         'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 55,
    'endIndex': 56,
    'paragraph': {'elements': [{'startIndex': 55,
       'endIndex': 56,
       'textRun': {'content': '\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 56,
    'endIndex': 59,
    'paragraph': {'elements': [{'startIndex': 56,
       'endIndex': 59,
       'textRun': {'content': '2.\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 59,
    'endIndex': 73,
    'paragraph': {'elements': [{'startIndex': 59,
       'endIndex': 73,
       'textRun': {'content': 'Example text…\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 73,
    'endIndex': 74,
    'paragraph': {'elements': [{'startIndex': 73,
       'endIndex': 74,
       'textRun': {'content': '\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 74,
    'endIndex': 94,
    'paragraph': {'elements': [{'startIndex': 74,
       'endIndex': 94,
       'textRun': {'content': '#tag1, #tag2, #tag3\n',
        'textStyle': {'underline': True,
         'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 94,
    'endIndex': 95,
    'paragraph': {'elements': [{'startIndex': 94,
       'endIndex': 95,
       'textRun': {'content': '\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 95,
    'endIndex': 105,
    'paragraph': {'elements': [{'startIndex': 95,
       'endIndex': 105,
       'textRun': {'content': '8.03.2023\n',
        'textStyle': {'bold': True,
         'fontSize': {'magnitude': 13, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'alignment': 'CENTER',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 105,
    'endIndex': 106,
    'paragraph': {'elements': [{'startIndex': 105,
       'endIndex': 106,
       'textRun': {'content': '\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 106,
    'endIndex': 109,
    'paragraph': {'elements': [{'startIndex': 106,
       'endIndex': 109,
       'textRun': {'content': '1.\n',
        'textStyle': {'bold': True,
         'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 109,
    'endIndex': 123,
    'paragraph': {'elements': [{'startIndex': 109,
       'endIndex': 123,
       'textRun': {'content': 'Example text…\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 123,
    'endIndex': 124,
    'paragraph': {'elements': [{'startIndex': 123,
       'endIndex': 124,
       'textRun': {'content': '\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 124,
    'endIndex': 137,
    'paragraph': {'elements': [{'startIndex': 124,
       'endIndex': 137,
       'textRun': {'content': '#tag1, #tag2\n',
        'textStyle': {'underline': True,
         'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 137,
    'endIndex': 138,
    'paragraph': {'elements': [{'startIndex': 137,
       'endIndex': 138,
       'textRun': {'content': '\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 138,
    'endIndex': 141,
    'paragraph': {'elements': [{'startIndex': 138,
       'endIndex': 141,
       'textRun': {'content': '2.\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 141,
    'endIndex': 155,
    'paragraph': {'elements': [{'startIndex': 141,
       'endIndex': 155,
       'textRun': {'content': 'Example text…\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 155,
    'endIndex': 156,
    'paragraph': {'elements': [{'startIndex': 155,
       'endIndex': 156,
       'textRun': {'content': '\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 156,
    'endIndex': 176,
    'paragraph': {'elements': [{'startIndex': 156,
       'endIndex': 176,
       'textRun': {'content': '#tag1, #tag2, #tag3\n',
        'textStyle': {'underline': True,
         'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 176,
    'endIndex': 177,
    'paragraph': {'elements': [{'startIndex': 176,
       'endIndex': 177,
       'textRun': {'content': '\n',
        'textStyle': {'underline': True,
         'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 177,
    'endIndex': 180,
    'paragraph': {'elements': [{'startIndex': 177,
       'endIndex': 180,
       'textRun': {'content': '3.\n',
        'textStyle': {'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}},
   {'startIndex': 180,
    'endIndex': 186,
    'paragraph': {'elements': [{'startIndex': 180,
       'endIndex': 186,
       'textRun': {'content': '#tag1\n',
        'textStyle': {'underline': True,
         'fontSize': {'magnitude': 10, 'unit': 'PT'}}}}],
     'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
      'direction': 'LEFT_TO_RIGHT'}}}]},
 'documentStyle': {'background': {'color': {}},
  'pageNumberStart': 1,
  'marginTop': {'magnitude': 72, 'unit': 'PT'},
  'marginBottom': {'magnitude': 72, 'unit': 'PT'},
  'marginRight': {'magnitude': 72, 'unit': 'PT'},
  'marginLeft': {'magnitude': 72, 'unit': 'PT'},
  'pageSize': {'height': {'magnitude': 841.8897637795277, 'unit': 'PT'},
   'width': {'magnitude': 595.2755905511812, 'unit': 'PT'}},
  'marginHeader': {'magnitude': 36, 'unit': 'PT'},
  'marginFooter': {'magnitude': 36, 'unit': 'PT'},
  'useCustomHeaderFooterMargins': True},
 'namedStyles': {'styles': [{'namedStyleType': 'NORMAL_TEXT',
    'textStyle': {'bold': False,
     'italic': False,
     'underline': False,
     'strikethrough': False,
     'smallCaps': False,
     'backgroundColor': {},
     'foregroundColor': {'color': {'rgbColor': {}}},
     'fontSize': {'magnitude': 11, 'unit': 'PT'},
     'weightedFontFamily': {'fontFamily': 'Arial', 'weight': 400},
     'baselineOffset': 'NONE'},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'alignment': 'START',
     'lineSpacing': 115,
     'direction': 'LEFT_TO_RIGHT',
     'spacingMode': 'COLLAPSE_LISTS',
     'spaceAbove': {'unit': 'PT'},
     'spaceBelow': {'unit': 'PT'},
     'borderBetween': {'color': {},
      'width': {'unit': 'PT'},
      'padding': {'unit': 'PT'},
      'dashStyle': 'SOLID'},
     'borderTop': {'color': {},
      'width': {'unit': 'PT'},
      'padding': {'unit': 'PT'},
      'dashStyle': 'SOLID'},
     'borderBottom': {'color': {},
      'width': {'unit': 'PT'},
      'padding': {'unit': 'PT'},
      'dashStyle': 'SOLID'},
     'borderLeft': {'color': {},
      'width': {'unit': 'PT'},
      'padding': {'unit': 'PT'},
      'dashStyle': 'SOLID'},
     'borderRight': {'color': {},
      'width': {'unit': 'PT'},
      'padding': {'unit': 'PT'},
      'dashStyle': 'SOLID'},
     'indentFirstLine': {'unit': 'PT'},
     'indentStart': {'unit': 'PT'},
     'indentEnd': {'unit': 'PT'},
     'keepLinesTogether': False,
     'keepWithNext': False,
     'avoidWidowAndOrphan': True,
     'shading': {'backgroundColor': {}},
     'pageBreakBefore': False}},
   {'namedStyleType': 'HEADING_1',
    'textStyle': {'fontSize': {'magnitude': 20, 'unit': 'PT'}},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'direction': 'LEFT_TO_RIGHT',
     'spaceAbove': {'magnitude': 20, 'unit': 'PT'},
     'spaceBelow': {'magnitude': 6, 'unit': 'PT'},
     'keepLinesTogether': True,
     'keepWithNext': True,
     'pageBreakBefore': False}},
   {'namedStyleType': 'HEADING_2',
    'textStyle': {'bold': False, 'fontSize': {'magnitude': 16, 'unit': 'PT'}},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'direction': 'LEFT_TO_RIGHT',
     'spaceAbove': {'magnitude': 18, 'unit': 'PT'},
     'spaceBelow': {'magnitude': 6, 'unit': 'PT'},
     'keepLinesTogether': True,
     'keepWithNext': True,
     'pageBreakBefore': False}},
   {'namedStyleType': 'HEADING_3',
    'textStyle': {'bold': False,
     'foregroundColor': {'color': {'rgbColor': {'red': 0.2627451,
        'green': 0.2627451,
        'blue': 0.2627451}}},
     'fontSize': {'magnitude': 14, 'unit': 'PT'}},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'direction': 'LEFT_TO_RIGHT',
     'spaceAbove': {'magnitude': 16, 'unit': 'PT'},
     'spaceBelow': {'magnitude': 4, 'unit': 'PT'},
     'keepLinesTogether': True,
     'keepWithNext': True,
     'pageBreakBefore': False}},
   {'namedStyleType': 'HEADING_4',
    'textStyle': {'foregroundColor': {'color': {'rgbColor': {'red': 0.4,
        'green': 0.4,
        'blue': 0.4}}},
     'fontSize': {'magnitude': 12, 'unit': 'PT'}},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'direction': 'LEFT_TO_RIGHT',
     'spaceAbove': {'magnitude': 14, 'unit': 'PT'},
     'spaceBelow': {'magnitude': 4, 'unit': 'PT'},
     'keepLinesTogether': True,
     'keepWithNext': True,
     'pageBreakBefore': False}},
   {'namedStyleType': 'HEADING_5',
    'textStyle': {'foregroundColor': {'color': {'rgbColor': {'red': 0.4,
        'green': 0.4,
        'blue': 0.4}}},
     'fontSize': {'magnitude': 11, 'unit': 'PT'}},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'direction': 'LEFT_TO_RIGHT',
     'spaceAbove': {'magnitude': 12, 'unit': 'PT'},
     'spaceBelow': {'magnitude': 4, 'unit': 'PT'},
     'keepLinesTogether': True,
     'keepWithNext': True,
     'pageBreakBefore': False}},
   {'namedStyleType': 'HEADING_6',
    'textStyle': {'italic': True,
     'foregroundColor': {'color': {'rgbColor': {'red': 0.4,
        'green': 0.4,
        'blue': 0.4}}},
     'fontSize': {'magnitude': 11, 'unit': 'PT'}},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'direction': 'LEFT_TO_RIGHT',
     'spaceAbove': {'magnitude': 12, 'unit': 'PT'},
     'spaceBelow': {'magnitude': 4, 'unit': 'PT'},
     'keepLinesTogether': True,
     'keepWithNext': True,
     'pageBreakBefore': False}},
   {'namedStyleType': 'TITLE',
    'textStyle': {'fontSize': {'magnitude': 26, 'unit': 'PT'}},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'direction': 'LEFT_TO_RIGHT',
     'spaceAbove': {'unit': 'PT'},
     'spaceBelow': {'magnitude': 3, 'unit': 'PT'},
     'keepLinesTogether': True,
     'keepWithNext': True,
     'pageBreakBefore': False}},
   {'namedStyleType': 'SUBTITLE',
    'textStyle': {'italic': False,
     'foregroundColor': {'color': {'rgbColor': {'red': 0.4,
        'green': 0.4,
        'blue': 0.4}}},
     'fontSize': {'magnitude': 15, 'unit': 'PT'},
     'weightedFontFamily': {'fontFamily': 'Arial', 'weight': 400}},
    'paragraphStyle': {'namedStyleType': 'NORMAL_TEXT',
     'direction': 'LEFT_TO_RIGHT',
     'spaceAbove': {'unit': 'PT'},
     'spaceBelow': {'magnitude': 16, 'unit': 'PT'},
     'keepLinesTogether': True,
     'keepWithNext': True,
     'pageBreakBefore': False}}]},
 'revisionId': 'xxxxxxx',
 'suggestionsViewMode': 'SUGGESTIONS_INLINE',
 'documentId': 'xxxxxxx'}

Solution

  • Suppose d the dictionary returned by Google API, you can use:

    import itertools
    
    # Extract content data
    content = itertools.chain(*[c['paragraph']['elements']
                                   for c in d['body']['content']
                                   if 'paragraph' in c])
    
    # Extract expected data
    df = (pd.json_normalize(content)['textRun.content']
            .str.strip().loc[lambda x: x.str.len() > 0]
            .str.extract('^(?P<Date>\d+\.\d+\.\d+)|(?P<Paragraph>\d+\.)|(?P<Tags>#.*)')
            .dropna(how='all'))
    
    # Filter dataframe
    out = (df.ffill().loc[df['Tags'].notna()]
             .assign(Tags=lambda x: x['Tags'].str.split(',\s*'))
             .reset_index(drop=True))
    

    Output:

    >>> out
            Date Paragraph                   Tags
    0  7.03.2023        1.         [#tag1, #tag2]
    1  7.03.2023        2.  [#tag1, #tag2, #tag3]
    2  8.03.2023        1.         [#tag1, #tag2]
    3  8.03.2023        2.  [#tag1, #tag2, #tag3]
    4  8.03.2023        3.                [#tag1]