TfidfVectorizer code example

Example 1: TfidfVectorizer code

# TF-IDF vectorizer >>> Logistic Regression
#
# Trains a logistic-regression classifier on TF-IDF features of a text column
# and prints F1, recall and precision on a held-out 20% test split.
#
# Assumes `df` is a pandas DataFrame defined earlier that holds the
# preprocessed text in 'text_column_name_after_preprocessing' and the class
# labels in "Column_name" -- replace both placeholders with the real names.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Features are the raw (preprocessed) documents; they are vectorized after the
# split so the test documents never influence the learned vocabulary/IDF.
X = df['text_column_name_after_preprocessing']
y = df["Column_name"].values

#train test split:>>>>>>>>>>>
X_train_text, X_test_text, y_train_tfidf, y_test_tfidf = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

# Fit the vocabulary and IDF weights on the training documents only, then
# reuse the fitted vectorizer to transform the test documents.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)
# get_feature_names() was removed in scikit-learn 1.2; this is its replacement.
print(vectorizer.get_feature_names_out())

model_logit_tf = LogisticRegression(class_weight="balanced", solver='saga', max_iter=100)
model_logit_tf.fit(X_train_tfidf, y_train_tfidf)  # fit the model

# Predict once and reuse the predictions for every metric below.
y_pred_tfidf = model_logit_tf.predict(X_test_tfidf)  # prediction

#F1 score:>>>>>>>>>
f1score_TF = f1_score(y_test_tfidf, y_pred_tfidf, average='micro')
print(f"TF-IDF Model F1 Score for Logistic Regression: {f1score_TF * 100} %")

#Recall score:>>>>>>>>>
recall_score_TF = recall_score(y_test_tfidf, y_pred_tfidf, average='macro')
print(f"TF-IDF Model Recall Score for Logistic Regression: {recall_score_TF * 100} %")

#Precision score:>>>>>>>>>
precision_score_TF = precision_score(y_test_tfidf, y_pred_tfidf, average='macro')
print(f"TF-IDF Model Precision Score for Logistic Regression: {precision_score_TF * 100} %")

Example 2: TF-IDF weighting for a fixed vocabulary of words in scikit-learn

# Interactive session: a two-step Pipeline ('count' -> 'tfid') fitted on a
# four-document corpus with an explicit eight-word vocabulary.
#   * pipe['count'].transform(corpus) shows the raw term counts, one row per
#     document and one column per vocabulary word, in the order the
#     vocabulary list was given (e.g. 'document' occurs twice in document 2).
#   * pipe['tfid'].idf_ shows the learned inverse-document-frequency weight
#     for each vocabulary word; words present in all four documents
#     ('this', 'is', 'the') get 1.0, rarer words get larger weights.
#     NOTE(review): the values are consistent with scikit-learn's default
#     smoothed formula ln((1 + n) / (1 + df)) + 1 -- confirm against the
#     installed version's documentation.
#   * pipe.transform(corpus).shape confirms the TF-IDF matrix is
#     4 documents x 8 vocabulary terms.
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.pipeline import Pipeline
>>> import numpy as np
>>> corpus = ['this is the first document',
...           'this document is the second document',
...           'and this is the third one',
...           'is this the first document']
>>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
...               'and', 'one']
>>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
...                  ('tfid', TfidfTransformer())]).fit(corpus)
>>> pipe['count'].transform(corpus).toarray()
array([[1, 1, 1, 1, 0, 1, 0, 0],
       [1, 2, 0, 1, 1, 1, 0, 0],
       [1, 0, 0, 1, 0, 1, 1, 1],
       [1, 1, 1, 1, 0, 1, 0, 0]])
>>> pipe['tfid'].idf_
array([1.        , 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.        , 1.91629073, 1.91629073])
>>> pipe.transform(corpus).shape
(4, 8)