Custom sentence segmentation in spaCy
The following code works:
import spacy
# Load the 'en_core_web_sm' pipeline once at module level; the code below
# registers a custom sentence segmenter on this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')
def split_on_breaks(doc, marker='@SentBoundary@'):
    """Yield sentence spans of ``doc``, split at explicit boundary markers.

    A token whose text equals ``marker`` ends the current sentence. The
    marker token itself is never part of any yielded span.

    Args:
        doc: A sequence of tokens that supports iteration, ``len()`` and
            slicing, where every token exposes ``.text`` and its position
            ``.i`` (for example a spaCy ``Doc``).
        marker: Token text that denotes a sentence boundary.

    Yields:
        Non-empty slices ``doc[start:end]``, one per sentence, in order.
    """
    start = 0
    for word in doc:
        if word.text != marker:
            continue
        # Only emit when there is content before the marker. This avoids
        # empty spans for a leading marker or for consecutive markers.
        if word.i > start:
            yield doc[start:word.i]
        # The next sentence begins right after the marker token.
        start = word.i + 1
    # Emit the trailing sentence, if any tokens remain after the last marker.
    if start < len(doc):
        yield doc[start:len(doc)]
# SentenceSegmenter is used below but was never imported; `import spacy`
# alone does not bring it into scope, so the next line raised NameError.
from spacy.pipeline import SentenceSegmenter

# Wrap the custom strategy in a pipeline component and place it first in
# the pipeline.
# NOTE(review): SentenceSegmenter and passing a component object to
# add_pipe look like the spaCy 2.x API; confirm against the installed version.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_breaks)
nlp.add_pipe(sbd, first=True)
def get_sentences(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and return its sentences.

    The result is a list of whatever ``doc.sents`` yields; convert the items
    to strings if plain text is required.
    """
    parsed = nlp(text)
    sentences = [*parsed.sents]
    return sentences
# Usage examples. Expected results are shown as text for readability;
# get_sentences returns a list of doc slices, not plain strings.
# Ex1: an explicit marker after a full stop splits the text.
get_sentences("Bob meets Alice. @SentBoundary@ They play together.")
# => ["Bob meets Alice.", "They play together."] # two sentences
# Ex2: without a marker nothing is split, even at the full stop.
get_sentences("Bob meets Alice. They play together.")
# => ["Bob meets Alice. They play together."] # ONE sentence
# Ex3: a marker forces a split even where there is no sentence-final punctuation.
get_sentences("Bob meets Alice, @SentBoundary@ they play together.")
# => ["Bob meets Alice,", "they play together."] # two sentences
The right approach was to use SentenceSegmenter rather than setting sentence boundaries manually (examples here). This GitHub issue was also helpful.