-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathW2vec.py
More file actions
47 lines (34 loc) · 1.2 KB
/
W2vec.py
File metadata and controls
47 lines (34 loc) · 1.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 25 16:36:10 2021
@author: Aditi.Dhamat
"""
import bs4 as bs
import urllib.request
import re
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
#Getting the data(Scraping- parsing articles)
# Download the raw HTML of the article. The `with` block closes the HTTP
# response as soon as the body has been read, instead of leaving the
# connection open until garbage collection.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/Climate_change") as response:
    source = response.read()
soup = bs.BeautifulSoup(source, 'lxml')
# The article body is taken from the <p> tags; the text of every paragraph is
# concatenated in document order with no separator between paragraphs.
text = "".join(paragraph.text for paragraph in soup.find_all('p'))
#Preprocessing the text
# Pass 1 (before lower-casing): replace bracketed numeric markers such as
# "[12]" (presumably Wikipedia citation references) with a space, then
# collapse every run of whitespace into a single space.
for _pattern in (r"\[[0-9]*\]", r"\s+"):
    text = re.sub(_pattern, " ", text)

text = text.lower()

# Pass 2 (after lower-casing): blank out the listed punctuation characters,
# then every digit, and finally collapse the whitespace runs those
# substitutions leave behind. Full stops are not in the punctuation class,
# so they remain in the text.
for _pattern in (r'[@#\$%&\*\(\)\<\>\?\'\":;\]\[-]', r'\d', r'\s+'):
    text = re.sub(_pattern, ' ', text)
#Preparing the data
# Load the English stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the filter would re-read the list and
# scan it linearly for every single token.
stop_words = set(stopwords.words('english'))

# Split the cleaned text into sentences, tokenise each sentence into words,
# and drop stop words. The result is a list of token lists, one per sentence.
sentences = [
    [word for word in nltk.word_tokenize(sentence) if word not in stop_words]
    for sentence in nltk.sent_tokenize(text)
]
#Training Word2Vec model
# min_count=1 keeps every token in the vocabulary, including words that occur
# only once in the article.
model = Word2Vec(sentences, min_count=1)

# Vocabulary lookup table, keyed by word. gensim 4.0 removed
# `KeyedVectors.vocab` (accessing it raises AttributeError) in favour of
# `key_to_index`; fall back to `.vocab` so the script still runs on gensim 3.x.
# NOTE: the mapped values differ between versions (index vs. vocab record),
# but in both cases the keys are the vocabulary words.
try:
    words = model.wv.key_to_index
except AttributeError:
    words = model.wv.vocab

# Embedding vector learned for the word "global".
vector = model.wv['global']
# Nearest neighbours of "warming" in the embedding space, as returned by
# most_similar with its default settings.
similar = model.wv.most_similar('warming')