-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtxt minnig and basic NLP.py
More file actions
177 lines (118 loc) · 4.44 KB
/
txt minnig and basic NLP.py
File metadata and controls
177 lines (118 loc) · 4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
'''Text Mining and NLP'''
# Basic string handling on a sample sentence.
# NOTE: the bare expressions below discard their value when the file is run as
# a script; they only display a result when evaluated interactively.
Sentence = 'We are Learning TextMining'
'TextMining' in Sentence                # substring membership test
Sentence.index('Learning')              # character offset where 'Learning' starts
Sentence.split().index('TextMining')    # position of the word in the word list
Sentence.split()[2]                     # third word
Sentence.split()[2][::-1]               # third word, reversed

# Split once on whitespace and reuse the resulting word list.
word = Sentence.split()
first_word = word[0]
last_word = word[-1]                    # negative index instead of len(word)-1
concat_word = first_word + last_word
print(concat_word)

word[::2]                               # words at even positions (0, 2, ...)
Sentence[-3:]                           # last three characters
Sentence[::-1]                          # whole sentence reversed

# Reverse the letters of every word while keeping the word order. The loop
# variable is named differently from the list so the list is not shadowed.
print(' '.join(token[::-1] for token in word))
# Word tokenization and part-of-speech tagging
import nltk
from nltk import word_tokenize
nlp_phrase = 'I am learning Natural Lanhuage Processing Fundamental for not only job interview'
print(nlp_phrase)
# pos_tag expects a list of tokens. Passing the raw string would make it
# iterate over, and tag, every single character, so tokenize first.
nltk.pos_tag(word_tokenize(nlp_phrase)) # Parts of Speech Tagging
# The tokenizer and tagger models must be downloaded once before first use
# (resource names may differ between NLTK versions):
#nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords') # Stop Words from nltk library
from nltk.corpus import stopwords
# The corpus file is named 'english' (lowercase); 'English' only resolves on
# case-insensitive file systems.
stop_words = stopwords.words('english')
sentence1 = "I am learning NLP. It is one of the most popular library in Python"
sentence1_words = word_tokenize(sentence1) # Tokenize the sentence
print(sentence1_words)
#nltk.download('punkt')
# Filtering stop words from the input string.
# The stop word list is lowercase, so each token is lowercased for the
# comparison only (otherwise "I" and "It" would slip through); the original
# casing is kept in the output. A set makes each membership test O(1).
stop_word_set = set(stop_words)
sentence_no_stops = ' '.join(
    token for token in sentence1_words if token.lower() not in stop_word_set
)
print(sentence_no_stops)
# Replace words in a string
sentence2 = "I will get MY job before 14-02-2022"
# Name the old and new dates so the substitution reads as "old -> new".
old_date, new_date = '14-02-2022', '31-01-2022'
normalized_sentence = sentence2.replace(old_date, new_date)
print(normalized_sentence)
# Spelling correction (requires: pip install autocorrect)
from autocorrect import Speller
spell = Speller(lang='en')
# Evaluates to the corrected spelling; the value is only displayed when this
# line is run interactively.
spell('Natureal')
# Tokenize a deliberately misspelled sentence, correct each token, and join
# the corrected tokens back together with single spaces.
misspelled_text = "Ntural Luaguage Prcessin dealss with the art of extracting insightes from Natural Languaes"
sentence3 = word_tokenize(misspelled_text)
sentence_corrected = ' '.join(map(spell, sentence3))
# Stemming
stemmer = nltk.stem.PorterStemmer()
# Stem a handful of sample words. Stemming does not look into dictionary
# words, e.g. "battling" becomes "battl".
for sample in ("Programming", "Jumping", "Jumper", "battling", "amazing"):
    stemmer.stem(sample)
# Lemmatization
# Unlike stemming, lemmatization looks into dictionary words (WordNet).
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
Lemmatization = WordNetLemmatizer()
# Lemmatize the same kind of sample words used for stemming, for comparison.
for sample in ('Programming', 'Programs', 'battling', "amazing"):
    Lemmatization.lemmatize(sample)
# Chunking (Shallow Parsing) - Identifying named entities
nltk.download('maxent_ne_chunker')
nltk.download('words')
sentence4 = 'We are learning nlp in Python by 360DigiTMG which is based out of India.'
# Tokenize and POS-tag first: ne_chunk works on (word, tag) pairs.
a = nltk.pos_tag(word_tokenize(sentence4))
i = nltk.ne_chunk(a, binary=False)
#i = nltk.ne_chunk(nltk.pos_tag(word_tokenize(sentence4)), binary=True)
# ne_chunk returns a Tree whose children are plain (word, tag) tuples for
# ordinary tokens and nested Trees for named entities. Selecting by type
# (rather than by len(child) == 1) also keeps multi-token entities.
named_entities = [node for node in i if isinstance(node, nltk.Tree)]
named_entities
# Sentence tokenization: split a text into its individual sentences.
from nltk.tokenize import sent_tokenize
two_sentences = "We are learning NLP in Python. Which enable me to help this society."
sent_tokenize(two_sentences)
from nltk.wsd import lesk #Word sense disambiguation
#In general, simple_lesk() from pywsd does better than lesk from NLTK
# Each call must be given the sentence that provides the context for "bank",
# i.e. the sentence defined immediately above it.
sentence5 = "Keep your savings in the bank"
print(lesk(word_tokenize(sentence5), 'bank'))
sentence6 = "It's so risky to drive over the banks of the river"
print(lesk(word_tokenize(sentence6), 'bank'))
# "bank" has multiple meanings.
# The definitions for "bank" can be seen here:
from nltk.corpus import wordnet as wn
for ss in wn.synsets('bank'):
    print(ss, ss.definition())
###################################################################################
# Reference: Penn Treebank part-of-speech tags, as returned by nltk.pos_tag for English text.
'''1. CC Coordinating conjunction
2. CD Cardinal number
3. DT Determiner
4. EX Existential there
5. FW Foreign word
6. IN Preposition or subordinating conjunction
7. JJ Adjective
8. JJR Adjective, comparative
9. JJS Adjective, superlative
10. LS List item marker
11. MD Modal
12. NN Noun, singular or mass
13. NNS Noun, plural
14. NNP Proper noun, singular
15. NNPS Proper noun, plural
16. PDT Predeterminer
17. POS Possessive ending
18. PRP Personal pronoun
19. PRP$ Possessive pronoun
20. RB Adverb
21. RBR Adverb, comparative
22. RBS Adverb, superlative
23. RP Particle
24. SYM Symbol
25. TO to
26. UH Interjection
27. VB Verb, base form
28. VBD Verb, past tense
29. VBG Verb, gerund or present participle
30. VBN Verb, past participle
31. VBP Verb, non-3rd person singular present
32. VBZ Verb, 3rd person singular present
33. WDT Wh-determiner
34. WP Wh-pronoun
35. WP$ Possessive wh-pronoun
36. WRB Wh-adverb'''