Search code examples
pythonnlpsubstringlcs

How to merge common strings with different values between parenthesis in Python


I am processing some strings within lists that look like these:

['COLOR INCLUDES (40)', 'LONG_DESCRIPTION CONTAINS ("BLACK")', 'COLOR INCLUDES (38)']

['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)', 'COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)', 'COLOR INCLUDES (800)']

The thing is, for each list I want to merge the strings that share the same name into a single string that combines their values. I expect something like this:

['COLOR INCLUDES (40,38)', 'LONG_DESCRIPTION CONTAINS ("BLACK")']

['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)']

And some strings may have values without ():

['FAMILY EQUALS 1145']

What would be the most Pythonic and fastest (lazy :P) way of doing this?

I have tried using a regex to match each string up to the first "(", but some strings don't have their values between parentheses, and I can't find a fitting solution.

I have also tried the STree class from the suffix_trees library, which finds the LCS (longest common substring) of a list of strings, but then I ran out of ideas about how to handle the values and the closing parenthesis:

from suffix_trees import STree

# The three rule strings to compare. Each one has a space between the name and
# the opening parenthesis, which is what makes 'COLOR INCLUDES (' common to all
# of them.
rule_strings = [
    'COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)',
    'COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)',
    'COLOR INCLUDES (800)',
]

# NOTE(review): presumably STree.STree builds one generalized suffix tree over
# the whole list and lcs() returns the longest substring shared by every
# input -- confirm against the suffix_trees documentation.
st = STree.STree(rule_strings)
st.lcs()

out: 'COLOR INCLUDES ('

EDIT: SOLVED

As @stef said in the answer, I broke the problem into smaller pieces and solved it with his help. Here is the class Rule_process and the result:


class Rule_process:
    """Group rule strings by their trailing flag and merge rules that share a name.

    The input is one string of entries separated by ``|||``.  Every entry has
    the form ``<rule>:<flag>`` where ``<flag>`` is one of 0, 1, 2 or 4, and
    ``<rule>`` is either a single rule such as ``COLOR INCLUDES (40)`` or
    several parenthesised rules joined by `` OR ``.

    Typical use: call :meth:`append_rules` once, then :meth:`split_rule` once,
    then read :attr:`rules_dict`.
    """

    # Sample input used when no rules string is passed to the constructor.
    _DEFAULT_RULES = (
        '(COLOR INCLUDES (40)) OR (LONG_DESCRIPTION CONTAINS ("BLACK")):1'
        '|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839):0'
        '|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839):0'
        '|||COLOR INCLUDES (40):1'
        '|||COLOR INCLUDES (800):0'
    )

    # Flags that get a slot in ``rules_dict``; entries with any other flag are ignored.
    _KEYS = (0, 1, 2, 4)

    def __init__(self, rules=None):
        """Store the raw rules string and create an empty result mapping.

        Args:
            rules: ``|||``-separated ``<rule>:<flag>`` entries.  When omitted,
                the built-in sample string is used.
        """
        self.rules = self._DEFAULT_RULES if rules is None else rules
        # flag -> list of rule strings, or None while the flag has no rules.
        self.rules_dict = {key: None for key in self._KEYS}

    @staticmethod
    def _unique(items):
        """Return ``items`` without duplicates, keeping first-seen order."""
        seen = set()
        result = []
        for item in items:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result

    @staticmethod
    def _unwrap(part):
        """Remove the parentheses that wrap one operand of an ``OR`` expression.

        A leading ``(`` is dropped; the trailing ``)`` is dropped as well when
        that leaves the text with more closing than opening parentheses.
        Text that does not start with ``(`` is returned stripped but otherwise
        unchanged.
        """
        part = part.strip()
        if part.startswith("("):
            part = part[1:]
            if part.endswith(")") and part.count(")") > part.count("("):
                part = part[:-1]
        return part

    @staticmethod
    def _parse_rule(rule):
        """Split one rule into ``(name, values)``.

        ``COLOR INCLUDES (30,31)`` -> ``('COLOR INCLUDES', [30, 31])``
        ``LONG_DESCRIPTION CONTAINS ("BLACK")`` -> ``('LONG_DESCRIPTION CONTAINS', ['"BLACK"'])``
        ``FAMILY EQUALS 1145`` -> ``('FAMILY EQUALS', [1145])``

        Values that are not all integers are kept as one single string.
        """
        rule = rule.strip()
        if "(" in rule:
            # Everything before the first "(" is the name, so values may
            # contain whitespace.
            name, _, raw = rule.partition("(")
        else:
            pieces = rule.rsplit(maxsplit=1)
            if len(pieces) < 2:
                # A single token has no value part.
                return rule, []
            name, raw = pieces
        name = name.strip()
        raw = raw.replace("(", "").replace(")", "").strip()
        if not raw:
            return name, []
        try:
            return name, [int(token) for token in raw.split(",")]
        except ValueError:
            return name, [raw]

    @staticmethod
    def _format_rule(name, values):
        """Build the output string for one merged rule.

        Integer values are rendered as a Python list (``NAME [1, 2]``); any
        other values are joined with commas (``NAME "A","B"``).
        """
        if not values:
            return name
        if all(isinstance(value, int) for value in values):
            return name + " " + str(values)
        return name + " " + ",".join(str(value) for value in values)

    def append_rules(self):
        """Fill ``rules_dict`` with the individual rules found in ``self.rules``.

        Each entry is assigned to the flag after its last ``:``.  ``OR``
        expressions are split into their operands, wrapping parentheses are
        removed, and duplicates are dropped while keeping first-seen order.
        Entries without a recognised flag are ignored, and a flag that
        receives no rules keeps its previous value.

        Returns:
            The ``rules_dict`` mapping (the same object stored on the instance).
        """
        known_flags = {str(key): key for key in self._KEYS}
        grouped = {}

        for entry in self.rules.split("|||"):
            # rpartition splits on the LAST ":", so a ":" inside a value
            # does not confuse the flag detection.
            body, separator, flag = entry.strip().rpartition(":")
            key = known_flags.get(flag.strip())
            if not separator or key is None:
                continue

            bucket = grouped.setdefault(key, [])
            for part in body.split(" OR "):
                part = self._unwrap(part)
                if part:
                    bucket.append(part)

        for key, bucket in grouped.items():
            if bucket:
                self.rules_dict[key] = self._unique(bucket)

        return self.rules_dict

    def split_rule(self):
        """Merge the rules of every flag by name and store the result.

        For each flag, rules with the same name have their values combined
        (duplicates removed, first-seen order kept) and are written back as
        one string per name.  ``rules_dict`` is replaced by the new mapping;
        flags without rules map to ``None``.

        The strings written here are not in the input format, so this is
        meant to be called once after :meth:`append_rules`.
        """
        merged = {key: None for key in self.rules_dict}

        for key, rules in self.rules_dict.items():
            if not rules:
                continue

            values_by_name = {}
            for rule in rules:
                name, values = self._parse_rule(rule)
                values_by_name.setdefault(name, []).extend(values)

            merged[key] = [
                self._format_rule(name, self._unique(values))
                for name, values in values_by_name.items()
            ]

        self.rules_dict = merged

And the result:


# Run both stages on the built-in sample rules, then inspect the result.
rule_processor = Rule_process()
rule_processor.append_rules()   # group the individual rules under their flag
rule_processor.split_rule()     # merge the rules that share a name
rule_processor.rules_dict

OUT (the order of the elements inside each list is not significant and may differ from what is shown):

{0: ['COLOR INCLUDES [32, 33, 800, 99, 833, 838, 839, 74, 84, 85, 30, 823, 184, 409, 56, 93, 830, 31]'],
 1: ['COLOR INCLUDES [40]', 'LONG_DESCRIPTION CONTAINS "BLACK"'],
 2: None,
 4: None}

Solution

  • Split this task into smaller, simpler tasks.

    First task:

    Write a function that takes a string and returns a pair (name, list_of_values) where name is the first part of the string and list_of_values is a python list of integers.

    Hint: You can use '(' in s to test whether string s contains an opening parenthesis; you can use s.split() to split on whitespace or s.rsplit(maxsplit=1) to only split on the last whitespace; s.split('(') to split on opening parenthesis; and s.split(',') to split on comma.

    Second task:

    Write a function that takes a list of pairs (name, list_of_values) and merges the lists when the names are equal.

    Hint: This is extremely easy in python using a dict with name as key and list_of_values as value. You can use if name in d: ... else: to test whether a name is already in the dict or not; or you can use d.get(name, []) or d.setdefault(name, []) to automatically add a name: [] entry in the dict when name is not already in the dict.

    Third task:

    Write a function to convert back, from the pairs (name, list_of_values) to the strings "name (value1, value2, ...)". This task is easier than the first task, so I suggest doing it first.

    Hint: ' '.join(...) and ','.join(...) can both be useful.