# Load the corpus. NOTE: pandas reads empty CSV cells as np.nan, and
# sklearn's text vectorizers reject NaN documents outright
# ("ValueError: np.nan is an invalid document"), so the text column must
# be coerced to strings before vectorizing.
df = pd.read_csv("dataa.csv")
df.head()

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Cap the vocabulary at the 1000 most frequent terms.
n_features = 1000
tf_vectorizer = CountVectorizer(
    strip_accents="unicode",   # fold accented chars to their ASCII base
    max_features=n_features,
    stop_words="english",
    max_df=0.5,                # drop terms appearing in >50% of docs
    min_df=10,                 # drop terms appearing in <10 docs
)
# Replace NaN cells with the empty string and force str dtype; empty
# documents simply contribute no terms, so counts for real docs are
# unaffected. This is the fix for the ValueError traceback below.
tf = tf_vectorizer.fit_transform(df["content"].fillna("").astype(str))
ValueError Traceback (most recent call last)
in ()
1 n_features = 1000
2 tf_vectorizer = CountVectorizer(strip_accents ='unicode',max_features=n_features,stop_words='english',max_df = 0.5,min_df = 10)
----> 3 tf = tf_vectorizer.fit_transform(df.content)
/home/wanghan/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
837
838 vocabulary, X = self._count_vocab(raw_documents,
--> 839 self.fixed_vocabulary_)
840
841 if self.binary:
/home/wanghan/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
760 for doc in raw_documents:
761 feature_counter = {}
--> 762 for feature in analyze(doc):
763 try:
764 feature_idx = vocabulary[feature]
/home/wanghan/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in (doc)
239
240 return lambda doc: self._word_ngrams(
--> 241 tokenize(preprocess(self.decode(doc))), stop_words)
242
243 else:
/home/wanghan/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in decode(self, doc)
119
120 if doc is np.nan:
--> 121 raise ValueError("np.nan is an invalid document, expected byte or "
122 "unicode string.")
123
ValueError: np.nan is an invalid document, expected byte or unicode string.