-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathhelpers.py
More file actions
executable file
·71 lines (60 loc) · 2.4 KB
/
helpers.py
File metadata and controls
executable file
·71 lines (60 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
from website.models import Question
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
sw = stopwords.words('english')
def get_video_info(path):
    """Use ffmpeg to probe a video file for codec and duration information.

    Runs ``/usr/bin/ffmpeg -i <path>`` and scrapes its banner output.
    This has not been broadly tested and your mileage may vary.

    Parameters:
        path: filesystem path to the video file.

    Returns:
        dict with keys ``codec``, ``profile``, ``width``, ``height`` (strings
        from ffmpeg output), ``hours``, ``minutes``, ``seconds`` (Decimal),
        and ``duration`` (total seconds, Decimal).

    Raises:
        ValueError: if ffmpeg's output does not contain a parsable
            Duration / Video-stream line (bad path, no video stream).
    """
    from decimal import Decimal
    import subprocess
    import re
    # ffmpeg prints stream info on stderr; merge it into stdout so a single
    # decode covers everything (the old code decoded the bytes twice).
    process = subprocess.Popen(['/usr/bin/ffmpeg', '-i', path],
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, _ = process.communicate()
    output = stdout.decode("UTF-8")
    duration_match = re.search(
        r"Duration:\s{1}(?P<hours>\d+?):(?P<minutes>\d+?):(?P<seconds>\d+\.\d+?)",
        output, re.DOTALL)
    info_match = re.search(
        r": Video: (?P<codec>.*?), (?P<profile>.*?), (?P<width>.*?)x(?P<height>.*?), ",
        output, re.DOTALL)
    if duration_match is None or info_match is None:
        # Previously this crashed with AttributeError on .groupdict();
        # fail with a message that points at the real cause instead.
        raise ValueError("could not parse ffmpeg output for %r" % path)
    duration_m = duration_match.groupdict()
    info_m = info_match.groupdict()
    hours = Decimal(duration_m['hours'])
    minutes = Decimal(duration_m['minutes'])
    seconds = Decimal(duration_m['seconds'])
    total = 0
    total += 60 * 60 * hours
    total += 60 * minutes
    total += seconds
    info_m['hours'] = hours
    info_m['minutes'] = minutes
    info_m['seconds'] = seconds
    info_m['duration'] = total
    return info_m
def prettify(string):
    """Turn arbitrary text into a URL-friendly slug.

    Lowercases, converts whitespace/hyphen runs to single hyphens, and strips
    every character outside ``[a-z0-9-]``.

    Parameters:
        string: raw text (e.g. a title).

    Returns:
        str: slugified text, e.g. ``"Hello World"`` -> ``"hello-world"``.
    """
    string = string.lower()
    string = string.replace('-', ' ')
    string = string.strip()
    string = string.replace(' ', '-')
    # Raw strings: '\-' in a plain literal is an invalid escape sequence and
    # raises a SyntaxWarning/DeprecationWarning on modern Python.
    string = re.sub(r'[^A-Za-z0-9\-]+', '', string)
    string = re.sub(r'-+', '-', string)  # collapse hyphen runs
    return string
def pre_process(text):
    """Normalize raw text for tokenization.

    Lowercases, strips HTML-like tags and comments, and replaces every run of
    digits or non-word characters with a single space.

    Parameters:
        text: raw text, possibly containing markup.

    Returns:
        str: lowercased text containing only word characters separated by
        single spaces (may have leading/trailing spaces).
    """
    text = text.lower()
    text = re.sub(r'<.*?>', ' ', text)          # remove tags
    text = re.sub(r"<!--?.*?-->", " ", text)    # remove comments
    # The final sub collapses all digit/non-word runs (including any double
    # spaces produced above) into a single space, so the old
    # ``.replace(' ', ' ')`` calls — which replaced a space with a space —
    # were no-ops and have been removed.
    text = re.sub(r"(\d|\W)+", " ", text)
    return text
def clean_user_data(text):
    """Normalize, tokenize, and stopword-filter user-supplied text.

    Parameters:
        text: raw user text.

    Returns:
        list[str]: tokens from the pre-processed text with English
        stopwords removed.
    """
    tokens = word_tokenize(pre_process(text.lower()))
    return [tok for tok in tokens if tok not in sw]
def get_similar_questions(user_ques, question):
    """Score how similar a cleaned query is to a candidate question.

    Builds binary occurrence vectors over the combined token list and
    returns their cosine similarity.

    Parameters:
        user_ques: list of already-cleaned tokens (e.g. the output of
            ``clean_user_data``).
        question: raw question text; it is pre-processed, tokenized, and
            stopword-filtered here before comparison.

    Returns:
        float in [0, 1]; 0.0 when either side has no tokens (the previous
        sklearn-based version warned and returned 0 implicitly for an
        all-zero vector).
    """
    question_tokens = word_tokenize(pre_process(question))
    question_tokens = [w for w in question_tokens if w not in sw]
    vocab = user_ques + question_tokens
    # Binary occurrence vectors over the combined token list (the old code
    # also initialized `total = []` only to overwrite it immediately).
    v1 = [1 if w in user_ques else 0 for w in vocab]
    v2 = [1 if w in question_tokens else 0 for w in vocab]
    # For two vectors a plain dot product beats importing sklearn's pairwise
    # machinery; entries are 0/1, so sum(v) equals the sum of squares.
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = sum(v1) ** 0.5
    norm2 = sum(v2) ** 0.5
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)