Double list comprehension for occurrences of a string in a list of strings
Try this out.
import re

def filter_phrases(phrases):
    # NOTE: assumes the phrases contain no regex metacharacters
    phrase_l = sorted(phrases, key=len)
    for i, v in enumerate(phrase_l):
        for j in phrase_l[i + 1:]:
            if re.search(rf'\b{v}\b', j):
                phrases.remove(v)
                break  # `v` is gone; a second removal would raise KeyError
    return phrases
text = [
    ['hello this is me'],
    ['oh you know u'],
    ['a quick brown fox jumps over the lazy dog']
]
phrases = [
    ['this is', 'u'],
    ['oh you', 'me'],
    ['fox', 'brown fox']
]

# Flatten the `text` and `phrases` lists
text = [
    line for l in text
    for line in l
]
phrases = {
    phrase for l in phrases
    for phrase in l
}
# `filter_phrases()` drops any phrase that is
# contained inside a longer phrase, e.g. with
# {'fox', 'brown fox'} it keeps only 'brown fox'.
# If you're sure your phrase list has no such
# overlaps, you can safely skip this call.
phrases = filter_phrases(phrases)
result = []
for line in text:
    # This is the pattern to match the 'space'
    # before the phrases in the line on which
    # the split is to be done.
    l_phrase_1 = '|'.join([
        f'(?={phrase})' for phrase in phrases
        if re.search(rf'\b{phrase}\b', line)
    ])
    # This is the pattern to match the 'space'
    # after the phrases in the line on which
    # the split is to be done.
    l_phrase_2 = '|'.join([
        f'(?<={phrase})' for phrase in phrases
        if re.search(rf'\b{phrase}\b', line)
    ])
    # Now, we combine both patterns `l_phrase_1`
    # and `l_phrase_2` to create our master regex.
    result.append(re.split(
        rf'\s(?:{l_phrase_1})|(?:{l_phrase_2})\s',
        line
    ))

print(result)
# OUTPUT (PRETTY FORM)
#
# [
# ['hello', 'this is', 'me'],
# ['oh you', 'know', 'u'],
# ['a quick', 'brown fox', 'jumps over the lazy dog']
# ]
Here, I've used re.split to split each line either just before or just after each matched phrase.
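To see what the master regex reduces to for a single line and a single phrase, here is a minimal sketch (the phrase and line are taken from the example above):

import re

line = 'hello this is me'
# Split on the space BEFORE the phrase (lookahead) or on
# the space AFTER the phrase (lookbehind); the phrase itself
# is never consumed, so it survives the split intact.
print(re.split(r'\s(?:(?=this is))|(?:(?<=this is))\s', line))
# ['hello', 'this is', 'me']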
Clarified with original poster:
Given the text
pack my box with five dozen liquor jugs
and the phrase five dozen
the result should be:
['pack', 'my', 'box', 'with', 'five dozen', 'liquor', 'jugs']
not:
['pack my box with', 'five dozen', 'liquor jugs']
Each text and phrase is converted to a Python list of words, e.g. ['this', 'is', 'an', 'example'], which prevents 'u' from being matched inside a word.
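A quick sketch of why word lists avoid substring false positives, using strings from the examples above:

words = 'oh you know u'.split()  # ['oh', 'you', 'know', 'u']
print('u' in words)  # True -- matches only the standalone word
print('u' in 'you')  # True -- a plain substring test would also
                     #         hit the 'u' inside 'you'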
All possible subphrases of the text are generated by compile_subphrases(). Longer phrases (more words) are generated first, so they are matched before shorter ones: 'five dozen jugs' would always be matched in preference to 'five dozen' or 'five'.
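For example, with compile_subphrases() as defined in the script below, the subphrases of a three-word text come out longest first:

print(list(compile_subphrases('five dozen jugs')))
# ['five dozen jugs', 'five dozen', 'dozen jugs', 'five', 'dozen', 'jugs']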
Phrase and subphrase are compared using list slices, roughly like this:
text = ['five', 'dozen', 'liquor', 'jugs']
phrase = ['liquor', 'jugs']
if text[2:4] == phrase:  # the slice must span len(phrase) words
    print('matched')
Using this method for comparing phrases, the script walks through the original text, rewriting it with the phrases picked out.
texts = [['hello this is me'], ['oh you know u']]
phrases_to_match = [['this is', 'u'], ['oh you', 'me']]

from itertools import chain

def flatten(list_of_lists):
    return list(chain(*list_of_lists))

def compile_subphrases(text, minwords=1, include_self=True):
    words = text.split()
    text_length = len(words)
    max_phrase_length = text_length if include_self else text_length - 1
    # NOTE: longest phrases first
    for phrase_length in range(max_phrase_length + 1, minwords - 1, -1):
        n_length_phrases = (' '.join(words[r:r + phrase_length])
                            for r in range(text_length - phrase_length + 1))
        yield from n_length_phrases

def match_sublist(mainlist, sublist, i):
    if i + len(sublist) > len(mainlist):
        return False
    return sublist == mainlist[i:i + len(sublist)]

phrases_to_match = list(flatten(phrases_to_match))
texts = list(flatten(texts))
results = []
for raw_text in texts:
    print(f"Raw text: '{raw_text}'")
    matched_phrases = [
        subphrase.split()
        for subphrase
        in compile_subphrases(raw_text)
        if subphrase in phrases_to_match
    ]
    phrasal_text = []
    index = 0
    text_words = raw_text.split()
    while index < len(text_words):
        for matched_phrase in matched_phrases:
            if match_sublist(text_words, matched_phrase, index):
                phrasal_text.append(' '.join(matched_phrase))
                index += len(matched_phrase)
                break
        else:
            phrasal_text.append(text_words[index])
            index += 1
    results.append(phrasal_text)
print(f'Phrases to match: {phrases_to_match}')
print(f"Results: {results}")
Results:
$ python3 main.py
Raw text: 'hello this is me'
Raw text: 'oh you know u'
Phrases to match: ['this is', 'u', 'oh you', 'me']
Results: [['hello', 'this is', 'me'], ['oh you', 'know', 'u']]
For testing this and other answers with larger datasets, try this at the start of the code. It generates hundreds of variations on a single long sentence to simulate hundreds of texts.
from itertools import chain, combinations
import random

# texts = [['hello this is me'], ['oh you know u']]
theme = ' '.join([
    'pack my box with five dozen liquor jugs said',
    'the quick brown fox as he jumped over the lazy dog'
])
variations = [
    ' '.join(combination)
    for combination
    in combinations(theme.split(), 5)
]
texts = random.choices(variations, k=500)
# phrases_to_match = [['this is', 'u'], ['oh you', 'me']]
phrases_to_match = [
    ['pack my box', 'quick brown', 'the quick', 'brown fox'],
    ['jumped over', 'lazy dog'],
    ['five dozen', 'liquor', 'jugs']
]
This uses Python's best-in-class list slicing. phrase[::2] creates a list slice consisting of the 0th, 2nd, 4th, 6th... elements of a list. This is the basis of the following solution.
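A quick illustration of the even/odd slices:

letters = ['a', 'b', 'c', 'd', 'e']
print(letters[::2])   # ['a', 'c', 'e']  -> even-numbered elements
print(letters[1::2])  # ['b', 'd']       -> odd-numbered elements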
For each phrase, a | symbol is put on either side of each found phrase. The following shows 'this is' being marked in 'hello this is me':
'hello this is me' -> 'hello|this is|me'
When the text is split on |:
['hello', 'this is', 'me']
the even-numbered elements [::2] are non-matches and the odd elements [1::2] are the matched phrases:

                0          1          2
unmatched:  'hello',                'me'
matched:                'this is',
If there are different numbers of matched and unmatched elements in the segment, the gaps are filled with empty strings using zip_longest so that there is always a balanced pair of unmatched and matched text:

                0          1          2      3
unmatched:  'hello',                'me',
matched:                'this is',          ''
For each phrase, the previously unmatched (even-numbered) elements of the text are scanned, the phrase (if found) is delimited with | and the results merged back into the segmented text. The matched and unmatched segments are merged back into the segmented text using zip_longest() followed by flatten(), taking care to maintain the even (unmatched) and odd (matched) indexes of new and existing text segments (see the sketch below). The newly-matched phrases are merged back in as odd-numbered elements, so they will not be scanned again for embedded phrases. This prevents conflict between phrases with similar wording like 'this is' and 'this'.
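Here is a sketch of one such merge step, using the flatten() helper from the full script below; the segment values come from the running example:

from itertools import zip_longest

def flatten(string_list):
    flat = []
    for el in string_list:
        if isinstance(el, list) or isinstance(el, tuple):
            flat.extend(el)
        else:
            flat.append(el)
    return flat

new_segments = ['hello', 'me']  # even: freshly scanned unmatched text
old_matches = ['this is']       # odd: phrases matched on an earlier pass
merged = flatten(zip_longest(new_segments, old_matches, fillvalue=''))
print(merged)  # ['hello', 'this is', 'me', '']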
flatten() is used everywhere. It finds sub-lists embedded in a larger list and flattens their contents down into the main list:
['outer list 1', ['inner list 1', 'inner list 2'], 'outer list 2']
becomes:
['outer list 1', 'inner list 1', 'inner list 2', 'outer list 2']
This is useful for collecting phrases from multiple embedded lists, as well as merging split or zipped sublists back into the segmented text:
[['the quick brown fox says', ''], ['hello', 'this is', 'me', '']] ->
['the quick brown fox says', '', 'hello', 'this is', 'me', ''] ->

                     0                 1       2          3        4      5
unmatched:  'the quick brown fox says',     'hello',              'me',
matched:                               '',             'this is',        ''
At the very end, the elements that are empty strings, which were just for even-odd alignment, can be removed:
['the quick brown fox says', '', 'hello', 'this is', '', 'me', ''] ->
['the quick brown fox says', 'hello', 'this is', 'me']
texts = [['hello this is me'], ['oh you know u'],
         ['the quick brown fox says hello this is me']]
phrases_to_match = [['this is', 'u'], ['oh you', 'you', 'me']]

from itertools import zip_longest

def flatten(string_list):
    flat = []
    for el in string_list:
        if isinstance(el, list) or isinstance(el, tuple):
            flat.extend(el)
        else:
            flat.append(el)
    return flat

phrases_to_match = flatten(phrases_to_match)
# longer phrases are given priority to avoid problems with overlapping
phrases_to_match.sort(key=lambda phrase: -len(phrase.split()))
segmented_texts = []
for text in flatten(texts):
    segmented_text = text.split('|')
    for phrase in phrases_to_match:
        new_segments = segmented_text[::2]
        delimited_phrase = f'|{phrase}|'
        for match in [f' {phrase} ', f' {phrase}', f'{phrase} ']:
            new_segments = [
                segment.replace(match, delimited_phrase)
                for segment
                in new_segments
            ]
        new_segments = flatten([segment.split('|') for segment in new_segments])
        segmented_text = new_segments if len(segmented_text) == 1 else \
            flatten(zip_longest(new_segments, segmented_text[1::2], fillvalue=''))
    segmented_text = [segment for segment in segmented_text if segment.strip()]
    # option 1: unmatched text is split into words
    segmented_text = flatten([
        segment if segment in phrases_to_match else segment.split()
        for segment
        in segmented_text
    ])
    segmented_texts.append(segmented_text)
print(segmented_texts)
Results:
[['hello', 'this is', 'me'], ['oh you', 'know', 'u'],
['the', 'quick', 'brown', 'fox', 'says', 'hello', 'this is', 'me']]
Notice that the phrase 'oh you' has taken precedence over the subset phrase 'you' and there is no conflict.
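That precedence comes from the word-count sort near the top of the script (longer phrases are delimited first), combined with the fact that already-matched odd-numbered segments are never rescanned. A quick check of the sort order with the phrases used here:

phrases = ['this is', 'u', 'oh you', 'you', 'me']
phrases.sort(key=lambda phrase: -len(phrase.split()))
print(phrases)  # ['this is', 'oh you', 'u', 'you', 'me']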