Search code examples
python ms-word ms-office dictionary-comprehension

Word file to Python dictionary


I'm trying to turn a *.docx file with questions into a Python dictionary.

The questions have this format:

  1. Question
    a. first answer
    b. second answer
    c. third answer
    d. fourth answer
    e. fifth answer

In the file, the correct answer is the bold one, in this case the third. The word file is built with MS Word bullet points (1. and so on for questions, and a. and so on for answers).

The resulting dictionary should be like:

{
  '1': {  
    
    'question': 'the question text',
    'answer': ['first answer','second answer','third answer','fourth answer','fifth answer'],
    'correct_answer': 2
   },
    
   Other questions...

}

I tried this code:

from docx import *

def is_bold(run):
    """Return the ``bold`` attribute of *run* exactly as stored on the object."""
    bold_setting = run.bold
    return bold_setting

# Script overview: walk the paragraphs of sample.docx, collect every
# paragraph that looks like "<digit>. <question>" together with the
# "<letter>. <answer>" paragraphs that follow it, store the result in
# `questions_and_answers`, then print that dictionary.

# Open the document
doc = Document('sample.docx')

# Create an empty dictionary for questions and answers
questions_and_answers = {}

# Iterate only through paragraphs
for paragraph in doc.paragraphs:
    text = paragraph.text.strip()

    # Check if the paragraph starts with a number and a dot
    # NOTE: text[1] raises IndexError when the stripped text is a single digit.
    # NOTE(review): the question states the "1." / "a." markers come from
    # Word's automatic list numbering; if those markers are not part of
    # paragraph.text this test never matches and the dictionary stays
    # empty -- TODO confirm.
    if text and text[0].isdigit() and text[1] == '.':
        # Split on the first space only: the first token (e.g. "1.", dot
        # included) becomes the key, the remainder is the question text.
        # A paragraph without any space makes this unpacking raise ValueError.
        question_number, question = text.split(' ', 1)
        answer_choices = []
        correct_answer_index = None

        # Continue to the next paragraph that will contain the answers
        next_paragraph = paragraph
        while True:
            # NOTE(review): relies on the paragraph object exposing a
            # `next_paragraph` attribute -- confirm it exists in the
            # installed python-docx version.
            next_paragraph = next_paragraph.next_paragraph

            # If there are no more paragraphs or it starts with a number, we've reached the end of the answers
            if not next_paragraph or (next_paragraph.text.strip() and next_paragraph.text.strip()[0].isdigit()):
                break

            next_text = next_paragraph.text.strip()

            # If it starts with a letter and a period, consider it as an answer
            # NOTE: as above, next_text[1] raises IndexError for one-character text.
            if next_text and next_text[0].isalpha() and next_text[1] == '.':
                answer_run = next_paragraph.runs[0]  # Consider only the first "run" to check the style
                answer_text = next_text[3:]  # Remove the answer format (a., b., c., ...)
                answer_choices.append(answer_text)

                # Check if the answer is bold (hence, correct)
                # If several answers are bold, the last one wins.
                if is_bold(answer_run):
                    correct_answer_index = len(answer_choices) - 1  # Save the index of the correct answer

        # Add the question and answers to the dictionary
        questions_and_answers[question_number] = {
            'question': question,
            'answers': answer_choices,
            'correct_answer_index': correct_answer_index
        }

# Print the resulting dictionary
for number, data in questions_and_answers.items():
    print(f"{number}: {data['question']}")
    print("Answers:")
    for answer in data['answers']:
        print(f"- {answer}")
    print(f"Index of the correct answer: {data['correct_answer_index']}")
    print()

Unfortunately, I'm getting an empty dictionary. How do I fix this?


Solution

  • Related question

    According to the python-docx documentation on Read the Docs (Style-related objects – _NumberingStyle objects), this functionality is not implemented yet.

    But by merging these solutions, we can do something like this:

    import sys
    import docx
    from docx2python import docx2python as dx2py
    
    
    def ns_tag_name(node, name):
        """Return *name* qualified with the namespace URI of *node*'s prefix.

        The result has the form ``{uri}name``. When *node* has an empty
        namespace map or no prefix, *name* is returned unchanged.
        """
        if not (node.nsmap and node.prefix):
            return name
        namespace_uri = node.nsmap[node.prefix]
        return f"{{{namespace_uri:s}}}{name:s}"
    
    
    def descendants(node, desc_strs):
        """Collect the descendants of *node* that match a chain of tag names.

        *desc_strs* is a sequence of levels; each level is an iterable of tag
        names to look for among the children at that depth.

        Returns ``[]`` when *node* is ``None``, ``[node]`` when no levels are
        left, and otherwise a dict mapping each matched tag name of the first
        level to the list of results gathered below it. Children whose
        sub-search comes back empty are left out.
        """
        if node is None:
            return []
        if not desc_strs:
            return [node]

        found = {}
        remaining_levels = desc_strs[1:]
        for tag in desc_strs[0]:
            # The tag is qualified with the namespace of the parent node.
            for element in node.iterchildren(ns_tag_name(node, tag)):
                below = descendants(element, remaining_levels)
                if below:
                    bucket = found.setdefault(tag, [])
                    if isinstance(below, list):
                        # Leaf level: a flat list of nodes.
                        bucket.extend(below)
                    else:
                        # Intermediate level: keep the nested dict as one item.
                        bucket.append(below)
        return found
    
    
    def simplified_descendants(desc_dict):
        """Flatten a nested dict of lists into a single list of leaf values.

        Every value of *desc_dict* is iterated; nested dicts are flattened
        recursively, anything else is kept as a leaf, in encounter order.
        """
        leaves = []
        for children in desc_dict.values():
            for child in children:
                if not isinstance(child, dict):
                    leaves.append(child)
                    continue
                leaves += simplified_descendants(child)
        return leaves
    
    
    def process_list_data(attrs):
        """Return the integer ``val`` attribute of the first element in *attrs*.

        *attrs* is flattened with ``simplified_descendants``; the first
        resulting element's namespaced ``val`` attribute is converted to int.
        """
        first_element = simplified_descendants(attrs)[0]
        val_key = ns_tag_name(first_element, "val")
        return int(first_element.attrib[val_key])
    
    
    def collect_list_with_levels(fname=r"./doc.docx"):
        """Collect the numbered-list paragraphs of a .docx file with their levels.

        Args:
            fname: Path of the document to read. Defaults to ``./doc.docx``,
                the path that used to be hard-coded.

        Returns:
            A list with one dict per paragraph that carries a
            ``pPr/numPr/ilvl`` element, in document order. Each dict has the
            keys ``"text"`` (paragraph text), ``"level"`` (value computed by
            ``process_list_data``) and ``"bold"`` (true when any run of the
            paragraph has a truthy ``font.bold``).
            If python-docx and docx2python disagree on the number of
            paragraphs, a message is printed and ``-1`` is returned instead.
        """
        docd = docx.Document(fname)
        docdpy = dx2py(fname)
        docdpy_runs = docdpy.document_runs[0][0][0]
        if len(docd.paragraphs) != len(docdpy_runs):
            print("Lengths don't match. Abort")
            return -1

        subnode_tags = (("pPr",), ("numPr",), ("ilvl",))  # (("pPr",), ("numPr",), ("ilvl", "numId"))  # numId is for matching elements from word/numbering.xml
        result = []
        for par in docd.paragraphs:
            numbered_attrs = descendants(par._element, subnode_tags)
            if not numbered_attrs:
                # Not part of a numbered list: nothing to record.
                continue
            result.append({
                "text": par.text,
                "level": process_list_data(numbered_attrs),
                "bold": any(run.font.bold for run in par.runs),
            })
        return result
    
    
    def build_qa_dict(docx_content, *, zero_based=False):
        """Group a flat list of list paragraphs into a question/answer dict.

        Args:
            docx_content: Iterable of dicts with the keys ``"text"``,
                ``"level"`` and ``"bold"``. A falsy ``"level"`` (0) marks a
                question; any other level marks an answer to the most recent
                question.
            zero_based: When true, ``correct_answer`` is the 0-based index
                of the bold answer. The default keeps the 1-based position.

        Returns:
            A dict keyed by the question's running number as a string
            (``"1"``, ``"2"``, ...). Each value holds ``'question'``,
            ``'answers'`` (list of answer texts) and ``'correct_answer'``
            (position of the last bold answer, or ``None`` if none is bold).

        Every question is kept, including one without answers or with empty
        text, no matter where it appears. Answers that occur before the
        first question are ignored.
        """
        offset = 1 if zero_based else 0
        entries = []
        current = None

        for par in docx_content:
            if not par["level"]:
                # A level-0 paragraph starts a new question.
                current = {
                    'question': par['text'],
                    'answers': [],
                    'correct_answer': None,
                }
                entries.append(current)
            elif current is not None:
                current['answers'].append(par['text'])
                if par['bold']:
                    # Computed after the append, so the position is 1-based
                    # unless zero_based shifts it down by one.
                    current['correct_answer'] = len(current['answers']) - offset

        return {str(number): entry for number, entry in enumerate(entries, 1)}
    
    
    if __name__ == "__main__":
        # Read the list paragraphs, group them into questions, print the result.
        list_items = collect_list_with_levels()
        print(build_qa_dict(list_items))
    

    Input

    input-docx-list

    Output

    {
      "1": {
        "question": "Question",
        "answers": [
          "Answer",
          "Answer2",
          "AnswerCorrect"
        ],
        "correct_answer": 3
      },
      "2": {
        "question": "Question2",
        "answers": [
          "AnswerNew",
          "AnswerCorrect2",
          "AnswerNew2",
          "AnswerNew3"
        ],
        "correct_answer": 2
      }
    }