Example 1: TfidfVectorizer code
# TF-IDF vectorizer >>> Logistic Regression
#
# Assumes `df` is a DataFrame defined earlier; the two column names below are
# placeholders to be replaced with the real text and label columns.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# The preprocessed text is the only feature; labels come from the target column.
X = df['text_column_name_after_preprocessing']
y = df["Column_name"].values

# Train/test split:>>>>>>>>>>>
# Split the raw text BEFORE vectorizing so that the vocabulary and IDF weights
# are learned from the training portion only (no test-set leakage).
text_train, text_test, y_train_tfidf, y_test_tfidf = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(text_train)  # learn vocabulary + IDF
X_test_tfidf = vectorizer.transform(text_test)  # reuse the fitted vocabulary
# get_feature_names() was removed from scikit-learn; use the *_out variant.
print(vectorizer.get_feature_names_out())

model_logit_tf = LogisticRegression(class_weight="balanced", solver='saga', max_iter=100)
model_logit_tf.fit(X_train_tfidf, y_train_tfidf)  # fit the model
y_pred_tfidf = model_logit_tf.predict(X_test_tfidf)  # predict once, reuse below

# NOTE: F1 is micro-averaged while recall and precision are macro-averaged,
# so the three figures are not directly comparable.
# F1 score:>>>>>>>>>
f1score_TF = f1_score(y_test_tfidf, y_pred_tfidf, average='micro')
print(f"TF-IDF Model F1 Score for Logistic Regression: {f1score_TF * 100} %")
# Recall score:>>>>>>>>>
recall_score_TF = recall_score(y_test_tfidf, y_pred_tfidf, average='macro')
print(f"TF-IDF Model Recall Score for Logistic Regression: {recall_score_TF * 100} %")
# Precision score:>>>>>>>>>
precision_score_TF = precision_score(y_test_tfidf, y_pred_tfidf, average='macro')
print(f"TF-IDF Model Precision Score for Logistic Regression: {precision_score_TF * 100} %")
Example 2: TF-IDF weighting of a word in a vocabulary in scikit-learn
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.pipeline import Pipeline
>>> import numpy as np
>>> corpus = ['this is the first document',
... 'this document is the second document',
... 'and this is the third one',
... 'is this the first document']
>>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
... 'and', 'one']
>>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
... ('tfid', TfidfTransformer())]).fit(corpus)
>>> pipe['count'].transform(corpus).toarray()
array([[1, 1, 1, 1, 0, 1, 0, 0],
[1, 2, 0, 1, 1, 1, 0, 0],
[1, 0, 0, 1, 0, 1, 1, 1],
[1, 1, 1, 1, 0, 1, 0, 0]])
>>> pipe['tfid'].idf_
array([1. , 1.22314355, 1.51082562, 1. , 1.91629073,
1. , 1.91629073, 1.91629073])
>>> pipe.transform(corpus).shape
(4, 8)