# Example: CountVectorizer implements both tokenization and occurrence counting

# CountVectorizer implements both tokenization and occurrence counting
# in a single class.

from sklearn.feature_extraction.text import CountVectorizer

# Unigram walkthrough: build a vectorizer with default parameters, fit it on
# a tiny corpus, then inspect the analyzer, the vocabulary and the counts.
# Lines of the form "# <value>" show the expected result of the expression
# directly above them.
vectorizer = CountVectorizer()
# CountVectorizer()

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and return the document-term count matrix
# (one row per document, one column per vocabulary term).
X = vectorizer.fit_transform(corpus)
# <4x9 sparse matrix of type '<... 'numpy.int64'>'
#     with 19 stored elements in Compressed Sparse ... format>

# The analyzer is the callable that turns raw text into tokens. Note that
# the single-letter word "a" is absent from the expected token list below.
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (
    ['this', 'is', 'text', 'document', 'to', 'analyze'])
# True

# Each term is assigned a column index; the names come back in column order.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- on newer versions use
# list(vectorizer.get_feature_names_out()) instead; confirm the target
# scikit-learn version before changing the call.
vectorizer.get_feature_names() == (
    ['and', 'document', 'first', 'is', 'one',
     'second', 'the', 'third', 'this'])
# True

# Dense view of the counts; "second" (column 5) occurs twice in document 1.
X.toarray()
# array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
#        [0, 1, 0, 1, 0, 2, 1, 0, 1],
#        [1, 0, 0, 0, 1, 0, 1, 1, 0],
#        [0, 1, 1, 1, 0, 0, 1, 0, 1]]...)

# Reverse mapping from a term to its column index.
vectorizer.vocabulary_.get('document')
# 1

# Words not seen while fitting are ignored by later transform() calls,
# so an entirely new sentence maps to an all-zero row.
vectorizer.transform(['Something completely new.']).toarray()
# array([[0, 0, 0, 0, 0, 0, 0, 0, 0]]...)

# Bigram walkthrough: extract 2-grams of words in addition to the 1-grams
# (individual words). The token pattern must be delimited by a word boundary
# (\b) on BOTH sides; without the trailing backslash it would instead
# require a literal "b" at the end of every token.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)

# Build the analyzer first, then compare its output with the expected tokens
# (unigrams followed by bigrams).
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == (
    ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])
# True

# Fit on the same corpus as the unigram example; the vocabulary is now
# larger (21 columns in the expected output below).
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
# array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
#        [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
#        [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
#        [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]]...)

# The bigram "is this" appears only in the last document, which is the
# interrogative form 'Is this the first document?'.
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]
# array([0, 0, 0, 1]...)


