CountVectorizer code example
Example 1: CountVectorizer with a list of token lists (pre-tokenized documents)
# Each document is given as a list of tokens (here one "token" per document).
corpus = [["this is spam, 'SPAM'"],["this is ham, 'HAM'"],["this is nothing, 'NOTHING'"]]
from sklearn.feature_extraction.text import CountVectorizer
# lowercase=False plus an identity tokenizer makes CountVectorizer treat each
# inner list as an already-tokenized document instead of splitting raw strings.
# Bug fix: the original called fit_transform on the undefined name
# `splited_labels_from_corpus`; the input is the `corpus` defined above.
bag_of_words = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(corpus)
Example 2: TF-IDF weighting of a fixed vocabulary word in scikit-learn
# Doctest transcript: chain CountVectorizer (raw term counts over an explicit,
# fixed-order vocabulary) with TfidfTransformer (IDF reweighting) in a Pipeline.
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.pipeline import Pipeline
>>> import numpy as np
>>> corpus = ['this is the first document',
... 'this document is the second document',
... 'and this is the third one',
... 'is this the first document']
# Passing `vocabulary` pins both the feature set and the column order,
# so the count matrix columns below line up with this list.
>>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
... 'and', 'one']
>>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
... ('tfid', TfidfTransformer())]).fit(corpus)
# Raw term counts: rows = 4 documents, columns = 8 vocabulary words.
>>> pipe['count'].transform(corpus).toarray()
array([[1, 1, 1, 1, 0, 1, 0, 0],
[1, 2, 0, 1, 1, 1, 0, 0],
[1, 0, 0, 1, 0, 1, 1, 1],
[1, 1, 1, 1, 0, 1, 0, 0]])
# Learned inverse document frequencies: words in every document ('this',
# 'is', 'the') get idf 1.0; rarer words get larger weights.
>>> pipe['tfid'].idf_
array([1. , 1.22314355, 1.51082562, 1. , 1.91629073,
1. , 1.91629073, 1.91629073])
# Full pipeline output keeps the same shape: (n_documents, n_vocabulary).
>>> pipe.transform(corpus).shape
(4, 8)
Example 3: CountVectorizer in NLP (20 Newsgroups demo)
from sklearn.datasets import fetch_20newsgroupsfrom sklearn.feature_extraction.text import CountVectorizerimport numpy as np# Create our vectorizervectorizer = CountVectorizer()# Let's fetch all the possible text datanewsgroups_data = fetch_20newsgroups()# Why not inspect a sample of the text data?print('Sample 0: ')print(newsgroups_data.data[0])print()# Create the vectorizervectorizer.fit(newsgroups_data.data)# Let's look at the vocabulary:print('Vocabulary: ')print(vectorizer.vocabulary_)print()# Converting our first sample into a vectorv0 = vectorizer.transform([newsgroups_data.data[0]]).toarray()[0]print('Sample 0 (vectorized): ')print(v0)print()# It's too big to even see...# What's the length?print('Sample 0 (vectorized) length: ')print(len(v0))print()# How many words does it have?print('Sample 0 (vectorized) sum: ')print(np.sum(v0))print()# What if we wanted to go back to the source?print('To the source:')print(vectorizer.inverse_transform(v0))print()# So all this data has a lot of extra garbage... Why not strip it away?newsgroups_data = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))# Why not inspect a sample of the text data?print('Sample 0: ')print(newsgroups_data.data[0])print()# Create the vectorizervectorizer.fit(newsgroups_data.data)# Let's look at the vocabulary:print('Vocabulary: ')print(vectorizer.vocabulary_)print()# Converting our first sample into a vectorv0 = vectorizer.transform([newsgroups_data.data[0]]).toarray()[0]print('Sample 0 (vectorized): ')print(v0)print()# It's too big to even see...# What's the length?print('Sample 0 (vectorized) length: ')print(len(v0))print()# How many words does it have?print('Sample 0 (vectorized) sum: ')print(np.sum(v0))print()# What if we wanted to go back to the source?print('To the source:')print(vectorizer.inverse_transform(v0))print()
Example 4: CountVectorizer
vectorizer2.get_feature_names()