import re
#example 1
input_text = "((PERSON)María Rosa) ((VERB)pasará) unos dias aqui, hay que ((VERB)mover) sus cosas viejas de aqui, ya que sus cosméticos ((VERB)estorban) si ((VERB)estan) tirados por aquí. ((PERSON)Cyntia) es una buena modelo, su cabello es muy bello, hay que ((VERB)lavar) su cabello"
#example 2
input_text = "Sus útiles escolares ((VERB)estan) aqui, me sorprende que ((PERSON)Juan Carlos) los haya olvidado siendo que suele ((VERB)ser) tan cuidadoso con sus útiles."
#I need replace "sus" or "su" but under certain conditions
subject_capture_pattern = r"\(\(PERSON\)((?:\w\s*)+)\)" #underlined in red in the image
associated_info_capture_pattern = r"(?:sus|su)\s+((?:\w\s*)+)(?:\s+(?:del|de )|\s*(?:\(\(VERB\)|[.,;]))" #underlined in green in the image
identification_pattern =
replacement_sequence =
input_text = re.sub(identification_pattern, replacement_sequence, input_text, flags = re.IGNORECASE)
this is the correct output:
#for example 1
"((PERSON)María Rosa) ((VERB)pasará) unos dias aqui, hay que ((VERB)mover) cosas viejas ((CONTEXT) de María Rosa) de aqui, ya que cosméticos ((CONTEXT) de María Rosa) ((VERB)estorban) si ((VERB)estan) tirados por aquí. ((PERSON)Cyntia) es una buena modelo, cabello ((CONTEXT) de Cyntia) ((VERB)es) muy bello, hay que ((VERB)lavar) cabello ((CONTEXT) de Cyntia)"
#for example 2
"útiles escolares ((CONTEXT) NO DATA) ((VERB)estan) aqui, me sorprende que ((PERSON)Juan Carlos) los haya olvidado siendo que suele ((VERB)ser) tan cuidadoso con útiles ((CONTEXT) Juan Carlos)."
Details:
Replace the possessive pronouns "sus"
or "su"
with "de " + the content inside the last ((PERSON) "THIS SUBSTRING")
, and if there is no ((PERSON) "THIS SUBSTRING")
before then replace sus
or su
with ((PERSON) NO DATA)
Sentences are read from left to right, so the replacement will be the substring inside the parentheses ((PERSON)the substring)
before that "sus"
or "su"
, as shown in the example.
In the end, the replaced substrings should end up with this structure:
associated_info_capture_pattern + "((CONTEXT)" + subject_capture_pattern + ")"
This shows a way to do the replacement of su/sus like you asked for (albeit not with just a single re.sub). I didn't move the additional info, but you could modify it to handle that as well.
import re
subject_capture_pattern = r"\(\(PERSON\)((?:\w\s*)+)\)"
def replace_su_and_sus(input_text):
start = 0
replacement = "((PERSON) NO DATA)"
output_text = ""
for m in re.finditer(subject_capture_pattern, input_text):
output_text += re.sub(r"\b[Ss]us?\b", replacement, input_text[start:m.end()])
start = m.end()
replacement = m.group(0).replace("(PERSON)", "(CONTEXT) de ")
output_text += re.sub(r"\b[Ss]us?\b", replacement, input_text[start:])
return output_text
My strategy was: