vectorization nlp scikit-learn code example
Example: CountVectorizer in NLP
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer


def _fit_and_inspect(vectorizer, documents):
    """Fit ``vectorizer`` on ``documents`` and print a walkthrough of sample 0.

    Prints, in order: the raw text of the first document, the fitted
    vocabulary, the dense count vector of the first document, that vector's
    length and sum, and the tokens recovered from the vector.

    Args:
        vectorizer: A ``CountVectorizer``; it is (re)fitted in place.
        documents: Sequence of raw text documents; must be non-empty.

    Returns:
        The dense 1-D count vector of ``documents[0]``.
    """
    # Why not inspect a sample of the text data?
    print('Sample 0: ')
    print(documents[0])
    print()

    # Fit the vectorizer: this learns the vocabulary from the documents.
    vectorizer.fit(documents)

    # Let's look at the vocabulary:
    print('Vocabulary: ')
    print(vectorizer.vocabulary_)
    print()

    # Converting our first sample into a vector.
    # transform() returns a sparse matrix; densify it and take its only row.
    v0 = vectorizer.transform([documents[0]]).toarray()[0]
    print('Sample 0 (vectorized): ')
    print(v0)
    print()

    # It's too big to even see...
    # What's the length?
    print('Sample 0 (vectorized) length: ')
    print(len(v0))
    print()

    # How many words does it have?
    print('Sample 0 (vectorized) sum: ')
    print(np.sum(v0))
    print()

    # What if we wanted to go back to the source?
    # inverse_transform() expects a 2-D (n_samples, n_features) input, so the
    # single 1-D vector is reshaped into a one-row matrix first; passing the
    # bare 1-D array is rejected by current scikit-learn input validation.
    print('To the source:')
    print(vectorizer.inverse_transform(v0.reshape(1, -1)))
    print()

    return v0


# Create our vectorizer
vectorizer = CountVectorizer()

# Let's fetch the text data (by default this loads the training subset).
newsgroups_data = fetch_20newsgroups()
v0 = _fit_and_inspect(vectorizer, newsgroups_data.data)

# So all this data has a lot of extra garbage... Why not strip it away?
newsgroups_data = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
v0 = _fit_and_inspect(vectorizer, newsgroups_data.data)