I have some pascal-cased text that I'm trying to split into separate tokens/words.
For example, "Hello123AIIsCool" would become ["Hello", "123", "AI", "Is", "Cool"].
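Some rules:
- A capitalized word such as "Hello" is a single token.
- A run of digits stays together: "123" -> ["123"], not ["1", "2", "3"].
- A run of capitals ends before the capital that starts the next word: "ABCat" -> ["AB", "Cat"], not ["ABC", "at"].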
Inputs such as "Hello", "HelloAI", "HelloAIIsCool", "Hello123", "123AI", "AIIsCool", and any other combination I haven't provided are potential candidates.

I've tried a couple of regex variations. The following two attempts got me pretty close to what I want, but not quite.
import re
def extract_v0(string: str) -> list[str]:
    """Split *string* into capitalized words and digit runs (first attempt).

    A token is either one uppercase letter followed by zero or more
    lowercase letters, or a run of digits.  Because the lowercase part is
    optional, every capital of an acronym becomes its own token
    (``"AI"`` -> ``["A", "I"]``), which is the shortcoming of this attempt.

    Args:
        string: Pascal-cased text to tokenize.

    Returns:
        The matched tokens in left-to-right order.  Characters that match
        neither alternative are skipped.
    """
    word_pattern = r"[A-Z][a-z]*"
    num_pattern = r"\d+"
    pattern = f"{word_pattern}|{num_pattern}"
    extracts: list[str] = re.findall(pattern=pattern, string=string)
    return extracts
# Sample input from the question; this attempt splits the acronym "AI"
# into separate letters, as the output below shows.
string = "Hello123AIIsCool"
extract_v0(string)
['Hello', '123', 'A', 'I', 'Is', 'Cool']
import re
def extract_v1(string: str) -> list[str]:
    """Split *string* into words, digit runs and uppercase runs (second attempt).

    Alternatives are tried in this order at each position:

    1. a capitalized word (one uppercase letter plus at least one
       lowercase letter),
    2. a run of digits,
    3. an uppercase letter followed by any characters that are not
       lowercase ASCII letters.

    The third alternative is too greedy: it also consumes the capital that
    starts the next word (``"AIIs"`` -> ``["AII"]``, with the orphaned
    ``"s"`` skipped) as well as any digits that follow an acronym.

    Args:
        string: Pascal-cased text to tokenize.

    Returns:
        The matched tokens in left-to-right order.  Characters that match
        no alternative are skipped.
    """
    word_pattern = r"[A-Z][a-z]+"
    num_pattern = r"\d+"
    upper_pattern = r"[A-Z][^a-z]*"
    pattern = f"{word_pattern}|{num_pattern}|{upper_pattern}"
    extracts: list[str] = re.findall(pattern=pattern, string=string)
    return extracts
# Same sample input; this attempt merges the acronym "AI" with the "I" of
# "Is" and drops the trailing "s", as the output below shows.
string = "Hello123AIIsCool"
extract_v1(string)
['Hello', '123', 'AII', 'Cool']
This uses a combination of regex and looping. It works, but is this the best solution? Or is there some fancy regex that can do it?
import re
def extract_v2(string: str) -> list[str]:
    """Split pascal-cased *string* into words, digit runs and acronyms.

    The text is scanned once per pattern, in priority order: capitalized
    words first, then digit runs, then whatever uppercase runs remain.
    Each match is recorded and then blanked out so later passes cannot
    reuse its characters.

    Matches are blanked with padding of the *same length*.  Replacing a
    match with a single space would shift every later offset, so start
    positions from different passes could no longer be compared and tokens
    could come back out of order (``"ABCDEFHelloXY"`` used to yield
    ``["ABCDEF", "XY", "Hello"]``).

    Args:
        string: Pascal-cased text to tokenize.

    Returns:
        The tokens in the order they appear in *string*.  Characters that
        match no pattern are skipped.
    """
    # Order matters: words must claim their leading capital before the
    # uppercase pass runs, otherwise "ABCat" would yield "ABC".
    patterns = (r"[A-Z][a-z]+", r"\d+", r"[A-Z]+")
    claimed: list[tuple[int, str]] = []

    def _claim(match: re.Match[str]) -> str:
        """Record the match and return same-length padding to mask it."""
        token = match.group()
        claimed.append((match.start(), token))
        return " " * len(token)

    remaining = string
    for pattern in patterns:
        remaining = re.sub(pattern, _claim, remaining)

    # Offsets all refer to the original string, so sorting restores the
    # source order across passes.
    claimed.sort()
    return [token for _, token in claimed]
# Same sample input; the output below is the desired split.
string = "Hello123AIIsCool"
extract_v2(string)
['Hello', '123', 'AI', 'Is', 'Cool']
Based on your Version 1:
import re
def extract_v1(string: str) -> list[str]:
    """Split pascal-cased *string* into words, digit runs and acronyms.

    A single regex pass tries these alternatives in order at each position:

    1. a capitalized word (one uppercase letter plus at least one
       lowercase letter),
    2. a run of digits,
    3. a run of uppercase letters that is not immediately followed by a
       lowercase letter.

    The negative lookahead in the third alternative makes the run give up
    its last capital when a lowercase letter follows, so that capital can
    start the next word (``"ABCat"`` -> ``["AB", "Cat"]``).

    Args:
        string: Pascal-cased text to tokenize.

    Returns:
        The matched tokens in left-to-right order.  Characters that match
        no alternative are skipped.
    """
    word_pattern = r"[A-Z][a-z]+"
    num_pattern = r"\d+"
    upper_pattern = r"[A-Z]+(?![a-z])"  # Fixed
    pattern = f"{word_pattern}|{num_pattern}|{upper_pattern}"
    extracts: list[str] = re.findall(pattern=pattern, string=string)
    return extracts
# Same sample input as in the question, run through the fixed pattern.
string = "Hello123AIIsCool"
extract_v1(string)
Result:
['Hello', '123', 'AI', 'Is', 'Cool']
The fixed upper_pattern matches as many uppercase letters as possible, but stops one letter short when the run is immediately followed by a lowercase letter, so that the last capital is left to start the next word.