-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTF_IDF.py
More file actions
37 lines (26 loc) · 1.63 KB
/
TF_IDF.py
File metadata and controls
37 lines (26 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Setup for the TF-IDF walkthrough: imports, the sample text, the lemmatizer,
# and the sentence split that the rest of the script works on.
# NOTE(review): sent_tokenize, stopwords and WordNetLemmatizer rely on NLTK
# data packages being available locally (presumably fetched beforehand with
# nltk.download) — confirm for the target environment.
import nltk
# Sample text; it is split into sentences below and each sentence becomes one
# document of the corpus.
paragraph = """Natural language processing (NLP) is a fascinating field that combines linguistics, computer science, and artificial intelligence to enable computers to understand, interpret, and generate human language. It involves a variety of techniques and models to process text and speech data, making it possible for machines to perform tasks
such as language translation, sentiment analysis, text summarization, and question answering. NLP is widely used in applications like chatbots, virtual assistants, and search engines, helping to bridge the gap between human communication and computer understanding. As the amount of digital text and spoken data continues to grow, the importance of NLP in extracting meaningful information and facilitating human-computer interaction becomes even more significant."""
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance shared by every sentence processed later in the script.
wordnet =WordNetLemmatizer()
# Split the paragraph into a list of sentence strings.
sentence = nltk.sent_tokenize(paragraph)
# Report how many sentences were found.
print(len(sentence))
# Build the cleaned corpus: one normalized, lemmatized string per sentence.
# The stop-word set is built once up front; it does not change between words,
# so rebuilding it inside the comprehension condition was wasted work.
stop_words = set(stopwords.words("english"))
corpus = []
for raw_sentence in sentence:
    # Replace every character that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_sentence)
    print("removing the (commos full stop)", review)
    review = review.lower()
    print("lowercase", review)
    review = review.split()
    print("split the words ", review)
    # Drop stop words, then lemmatize the words that remain.
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    # Join the words back into a single string; the vectorizer is fed one
    # string per sentence.
    review_text = ' '.join(review)
    corpus.append(review_text)
    print("review", review_text)
# Turn the cleaned sentences into a dense TF-IDF matrix and print it.
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
# fit_transform learns the vocabulary from the corpus and returns the weighted
# document-term matrix; toarray() converts it to a dense array for printing.
tfidf_matrix = cv.fit_transform(corpus)
x = tfidf_matrix.toarray()
print(x)