How to split an NLP parse tree into clauses (independent and subordinate)?
You can use Tree.subtrees(). For more information, check the documentation of the NLTK Tree class.
Code:
from nltk import Tree
# Constituency parse (Penn Treebank bracketing) of the example sentence.
parse_str = "(ROOT (S (NP (PRP You)) (VP (MD could) (VP (VB say) (SBAR (IN that) (S (NP (PRP they)) (ADVP (RB regularly)) (VP (VB catch) (NP (NP (DT a) (NN shower)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBZ adds) (PP (TO to) (NP (NP (PRP$ their) (NN exhilaration)) (CC and) (NP (FW joie) (FW de) (FW vivre))))))))))))) (. .)))"
# Alternative input to try (sentence that starts with a subordinate clause):
#parse_str = "(ROOT (S (SBAR (IN Though) (S (NP (PRP he)) (VP (VBD was) (ADJP (RB very) (JJ rich))))) (, ,) (NP (PRP he)) (VP (VBD was) (ADVP (RB still)) (ADJP (RB very) (JJ unhappy))) (. .)))"

# Node labels that open a new clause segment.
CLAUSE_LABELS = ("S", "SBAR")


def split_clauses(tree, clause_labels=CLAUSE_LABELS):
    """Split a parse tree into clause-level text segments, in sentence order.

    The tree is walked depth-first. A new segment is started every time a
    node whose label is in ``clause_labels`` is entered, and again when it
    is left, so the words that follow a nested clause (for example the
    final punctuation) end up in their own trailing segment.

    Working on the tree structure instead of searching for one clause's
    text inside another avoids wrong cuts when a word is repeated, and it
    also handles sibling clauses and trees with zero or one clause.

    Args:
        tree: the root node. Any iterable node exposing ``label()`` is
            treated as a subtree (``nltk.Tree`` satisfies this); every
            other child is treated as a leaf token.
        clause_labels: labels that delimit clauses.

    Returns:
        A list of non-empty strings, each the space-joined tokens of one
        segment.
    """
    segments = [[]]

    def visit(node):
        # Anything without a label() method is a leaf token.
        if not hasattr(node, "label"):
            segments[-1].append(node)
            return
        is_clause = node.label() in clause_labels
        if is_clause:
            # Entering a clause: its words start a fresh segment.
            segments.append([])
        for child in node:
            visit(child)
        if is_clause:
            # Leaving a clause: whatever follows belongs to the parent.
            segments.append([])

    visit(tree)
    # Drop segments that received no tokens (e.g. back-to-back boundaries).
    return [" ".join(tokens) for tokens in segments if tokens]


if __name__ == "__main__":
    t = Tree.fromstring(parse_str)
    for text in split_clauses(t):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(text)
Output:
You could say
that
they regularly catch a shower ,
which
adds to their exhilaration and joie de vivre
.