I am processing some strings within lists that look like these:
['COLOR INCLUDES (40)', 'LONG_DESCRIPTION CONTAINS ("BLACK")', 'COLOR INCLUDES (38)']
['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)', 'COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)', 'COLOR INCLUDES (800)']
The thing is, for each list I want to merge the strings that share the same name into a single string that combines their values. I expect something like this:
['COLOR INCLUDES (40,38)', 'LONG_DESCRIPTION CONTAINS ("BLACK")']
['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)']
And some strings may have values without ():
['FAMILY EQUALS 1145']
What would be the most Pythonic and fastest (lazy :P) way of doing this?
I have tried using a regex to match each string up to the first "(", but some strings don't have their values between (), and I can't find a fitting solution.
I have also tried the STree class from the suffix_trees library, which finds the LCS (longest common substring) of a list of strings, but then I ran out of ideas about how to handle the values and the closing parenthesis:
from suffix_trees import STree
# Attempt with suffix_trees: pass three sample rule strings to STree and ask for their LCS.
# NOTE(review): the first two literals have no space before "(" while the third one does,
# so the output shown below may not correspond to these exact inputs -- confirm.
st = STree.STree(['COLOR INCLUDES(30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)',
'COLOR INCLUDES(30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)', 'COLOR INCLUDES (800)'])
st.lcs()
out: 'COLOR INCLUDES ('
EDIT: SOLVED
As @stef suggested in the answer, I broke the problem into smaller pieces and solved it with his help. Let me paste the class Rule_process and the result here:
class Rule_process:
    """Group labelled rule strings and merge the values of same-named rules.

    The input is one string of ``|||``-separated entries, each of the form
    ``<rule text>:<label>``.  The rule text is either a single rule such as
    ``COLOR INCLUDES (40)`` or several parenthesised rules joined by
    `` OR ``.  Only labels that are keys of ``rules_dict`` (0, 1, 2 and 4)
    are kept.

    Typical use is ``append_rules()`` followed by one call to
    ``split_rule()``; the result is then available in ``rules_dict``.
    """

    # Sample input, kept as the default so ``Rule_process()`` works as before.
    DEFAULT_RULES = (
        '(COLOR INCLUDES (40)) OR (LONG_DESCRIPTION CONTAINS ("BLACK")):1'
        '|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839):0'
        '|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839):0'
        '|||COLOR INCLUDES (40):1'
        '|||COLOR INCLUDES (800):0'
    )

    def __init__(self, rules=None):
        """Store the raw rules and create one empty bucket per known label.

        Args:
            rules: ``|||``-separated ``<rule text>:<label>`` entries.  When
                omitted, the built-in sample rules are used.
        """
        self.rules = self.DEFAULT_RULES if rules is None else rules
        # A bucket stays None until at least one rule carries its label.
        self.rules_dict = {0: None, 1: None, 2: None, 4: None}

    @staticmethod
    def _clean_rule(rule):
        """Drop the wrapping parentheses left over from an ``(A) OR (B)`` group."""
        rule = rule.replace("))", ")")
        return rule[1:] if rule.startswith("(") else rule

    @staticmethod
    def _parse_rule(rule):
        """Split one rule into ``(name, values)``.

        ``values`` is a list of ints when every comma-separated item is an
        integer, otherwise the raw value text (for example ``'"BLACK"'``).
        """
        head, paren, tail = rule.partition("(")
        if paren:
            # Splitting on the parenthesis (not on whitespace) keeps quoted
            # values that contain spaces in one piece.
            name = head.strip()
            raw = tail.strip()
            if raw.endswith(")"):
                raw = raw[:-1]
        else:
            # No parentheses, e.g. "FAMILY EQUALS 1145": the value is the
            # last whitespace-separated token.
            parts = rule.rsplit(maxsplit=1)
            name, raw = parts if len(parts) == 2 else (rule.strip(), "")
        try:
            return name, [int(item) for item in raw.split(",")]
        except ValueError:
            return name, raw

    def append_rules(self):
        """Distribute the raw rules into ``rules_dict`` by their label.

        Each entry is split at its last ``:`` into rule text and label.
        Entries without a known label or without rule text are ignored.
        `` OR `` groups are split into individual rules, wrapping parentheses
        are removed, and duplicates are dropped while keeping first-seen
        order.  Buckets that receive no rule are set to ``None``.

        Returns:
            ``self.rules_dict``, mapping each label to a list of rule strings
            or ``None``.
        """
        labels = {str(key): key for key in self.rules_dict}
        grouped = {key: [] for key in self.rules_dict}
        for entry in self.rules.split("|||"):
            text, sep, label = entry.rpartition(":")
            key = labels.get(label.strip())
            # ``key`` may legitimately be 0, so compare against None.
            if not sep or key is None or not text.strip():
                continue
            for part in text.split(" OR "):
                cleaned = self._clean_rule(part)
                if cleaned.strip():
                    grouped[key].append(cleaned)
        for key, found in grouped.items():
            # dict.fromkeys de-duplicates and, unlike set(), keeps the order.
            self.rules_dict[key] = list(dict.fromkeys(found)) or None
        return self.rules_dict

    def split_rule(self):
        """Merge the values of rules that share a name, per label.

        ``COLOR INCLUDES (40)`` and ``COLOR INCLUDES (38)`` become
        ``COLOR INCLUDES [40, 38]``: integer values are combined into one
        de-duplicated list in first-seen order.  Non-integer values are not
        combined; each distinct one is emitted as ``<name> <value>``, e.g.
        ``LONG_DESCRIPTION CONTAINS "BLACK"``.

        The result replaces ``self.rules_dict``; buckets without rules stay
        ``None``.  Nothing is returned.  The method expects the rule strings
        produced by ``append_rules`` and is not meant to be run twice on its
        own output.
        """
        merged = dict.fromkeys(self.rules_dict)
        for key, rules in self.rules_dict.items():
            if not rules:
                continue
            # name -> (integer values, text values), in first-seen order.
            groups = {}
            for rule in rules:
                if not rule.strip():
                    continue
                name, values = self._parse_rule(rule)
                numbers, texts = groups.setdefault(name, ([], []))
                if isinstance(values, list):
                    numbers.extend(values)
                else:
                    texts.append(values)
            joined = []
            for name, (numbers, texts) in groups.items():
                if numbers:
                    joined.append(f"{name} {list(dict.fromkeys(numbers))}")
                joined.extend(f"{name} {text}" for text in dict.fromkeys(texts))
            merged[key] = joined or None
        self.rules_dict = merged
And the result:
# Build the processor; with no arguments it works on its built-in sample rules.
process = Rule_process()
# Sort the rules into buckets by their trailing label and strip wrapping parentheses.
process.append_rules()
# Merge the values of rules that share a name inside each bucket.
process.split_rule()
# Inspect the final mapping (echoed when run in an interactive session).
process.rules_dict
OUT (the order of the values inside each list may vary):
{0: ['COLOR INCLUDES [32, 33, 800, 99, 833, 838, 839, 74, 84, 85, 30, 823, 184, 409, 56, 93, 830, 31]'],
1: ['COLOR INCLUDES [40]', 'LONG_DESCRIPTION CONTAINS "BLACK"'],
2: None,
4: None}
Split this task into smaller, simpler tasks.
Write a function that takes a string and returns a pair `(name, list_of_values)`, where `name` is the first part of the string and `list_of_values` is a Python list of integers.
Hint: You can use `'(' in s` to test whether string `s` contains an opening parenthesis; you can use `s.split()` to split on whitespace or `s.rsplit(maxsplit=1)` to split only on the last whitespace; `s.split('(')` to split on the opening parenthesis; and `s.split(',')` to split on commas.
Write a function that takes a list of pairs `(name, list_of_values)` and merges the lists when the names are equal.
Hint: This is extremely easy in Python using a dict with `name` as key and `list_of_values` as value. You can use `if name in d: ... else:` to test whether a name is already in the dict or not; or you can use `d.get(name, [])` to fall back to an empty list, or `d.setdefault(name, [])` to automatically add a `name: []` entry to the dict when `name` is not already in it.
Write a function to convert back from the pairs `(name, list_of_values)` to the strings `"name (value1, value2, ...)"`. This task is easier than the first task, so I suggest doing it first.
Hint: `' '.join(...)` and `','.join(...)` can both be useful.