Search code examples
pythondictionarykeyreducefunctools

Find intersection of values in dictionary under a same key - python


I have a file with lines, each line is split on "|", I want to compare arguments 5 from each line and if intersect, then proceed. This gets to a second part:

  • first arguments1,2 are compared by dictionary, and if they are same AND
  • if arguments5,6 are overlapping, then those lines get concatenated.

How to compare intersection of values under the same key? The code below works cross-key but not within same key:

from functools import reduce 
reduce(set.intersection, (set(val) for val in query_dict.values()))

Here is an example of lines: text1|text2|text3|text4 text 5| text 6| text 7 text 8| text1|text2|text12|text4 text 5| text 6| text 7| text9|text10|text3|text4 text 5| text 11| text 12 text 8|

The output should be: text1|text2|text3;text12|text4;text5;text4;text5|text6;text7 text8;text6

In other words, only those lines that are matching by 1st,2nd arguments (cells equal) and if 5th,6th arguments are overlapping (intersection) are concatenated.

Here is input file:

Angela Darvill|19036321|School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.|['GB','US']|['Salford', 'Eccles', 'Manchester']
Helen Stanley|19036320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']|['Brighton', 'Brighton']
Angela Darvill|190323121|School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.|['US']|['Brighton', 'Eccles', 'Manchester']
Helen Stanley|19576876320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']|['Brighton', 'Brighton']

The output should look like:

Angela Darvill|19036321;190323121|...
Helen Stanley|19036320;19576876320|...

Angela Darvill gets stacked because two records share same name, same country and same city(-ies).


Solution

  • Based on your improved question :

    import itertools
    
    
    data = """\
    Angela Darvill|19036321|School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.|['GB','US']|['Salford', 'Eccles', 'Manchester']
    Helen Stanley|19036320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']|['Brighton', 'Brighton']
    Angela Darvill|190323121|School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.|['US']|['Brighton', 'Eccles', 'Manchester']
    Helen Stanley|19576876320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']|['Brighton', 'Brighton']
    """
    
    lines = tuple(tuple(line.split('|')) for line in data.splitlines())
    
    results = []
    for line_a_index, line_a in enumerate(lines):
        # we want to compare each line with each other, so we start at index+1
        for line_b_index, line_b in enumerate(lines[line_a_index+1:], start=line_a_index+1):
            assert len(line_a) >= 5, f"not enough cells ({len(line_a)}) in line {line_a_index}"
            assert len(line_b) >= 5, f"not enough cells ({len(line_b)}) in line {line_b_index}"
            assert all(isinstance(cell, str) for cell in line_a)
            assert all(isinstance(cell, str) for cell in line_b)
    
            columns0_are_equal = line_a[0] == line_b[0]
            columns1_are_equal = line_a[1] == line_b[1]
            columns3_are_overlap = set(line_a[3]).issubset(set(line_b[3])) or set(line_b[3]).issubset(set(line_a[3]))
            columns4_are_overlap = set(line_a[4]).issubset(set(line_b[4])) or set(line_b[4]).issubset(set(line_a[4]))
            print(f"between lines index={line_a_index} and index={line_b_index}, {columns0_are_equal=} {columns1_are_equal=} {columns3_are_overlap=} {columns4_are_overlap=}")
            if (
                columns0_are_equal
                # and columns1_are_equal
                and (columns3_are_overlap or columns4_are_overlap)
            ):
                print("MATCH!")
                results.append(
                    (line_a_index, line_b_index,) + tuple(
                        ((cell_a or "") + (";" if (cell_a or cell_b) else "") + (cell_b or "")) if cell_a != cell_b
                        else cell_a
                        for cell_a, cell_b in itertools.zip_longest(line_a, line_b)
                    )
                )
    
    print("Fancy output :")
    lines_to_display = set(itertools.chain.from_iterable((lines[result[0]], lines[result[1]], result[2:]) for result in results))
    columns_widths = (max(len(str(index)) for result in results for index in (result[0], result[1])),) + tuple(
        max(len(cell) for cell in column)
        for column in zip(*lines_to_display)
    )
    
    for width in columns_widths:
        print("-" * width, end="|")
    print("")
    
    for result in results:
        for line_index, original_line in zip((result[0], result[1]), (lines[result[0]], lines[result[1]])):
            for column_index, cell in zip(itertools.count(), (str(line_index),) + original_line):
                if cell:
                    print(cell.ljust(columns_widths[column_index]), end='|')
            print("", end='\n')  # explicit newline
        for column_index, cell in zip(itertools.count(), ("=",) + result[2:]):
            if cell:
                print(cell.ljust(columns_widths[column_index]), end='|')
        print("", end='\n')  # explicit newline
    
    for width in columns_widths:
        print("-" * width, end="|")
    print("")
    
    expected_outputs = """\
    Angela Darvill|19036321;190323121|...
    Helen Stanley|19036320;19576876320|...
    """.splitlines()
    
    for result, expected_output in itertools.zip_longest(results, expected_outputs):
        actual_output = "|".join(result[2:])
        assert actual_output.startswith(expected_output[:-3])  # minus the "..."
    
    -|--------------|--------------------|---------------------------------------------------------------------------------------------------------------------------------------|------------------|------------------------------------------------------------------------|
    0|Angela Darvill|19036321            |School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.                                                   |['GB','US']       |['Salford', 'Eccles', 'Manchester']                                     |
    2|Angela Darvill|190323121           |School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.                                                   |['US']            |['Brighton', 'Eccles', 'Manchester']                                    |
    =|Angela Darvill|19036321;190323121  |School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.                                                   |['GB','US'];['US']|['Salford', 'Eccles', 'Manchester'];['Brighton', 'Eccles', 'Manchester']|
    1|Helen Stanley |19036320            |Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']            |['Brighton', 'Brighton']                                                |
    3|Helen Stanley |19576876320         |Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']            |['Brighton', 'Brighton']                                                |
    =|Helen Stanley |19036320;19576876320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']            |['Brighton', 'Brighton']                                                |
    -|--------------|--------------------|---------------------------------------------------------------------------------------------------------------------------------------|------------------|------------------------------------------------------------------------|
    

    You can see that the lines index 0 and 2 have been merged, same for the lines index 1 and 3.