-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathhelpers.py
More file actions
executable file
·71 lines (60 loc) · 2.4 KB
/
helpers.py
File metadata and controls
executable file
·71 lines (60 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
from website.models import Question
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
sw = stopwords.words('english')
def get_video_info(path):
    """Use ffmpeg to probe a video file for codec and duration information.

    Runs ``/usr/bin/ffmpeg -i <path>`` and scrapes its banner output.
    This has not been broadly tested and your mileage may vary.

    Parameters:
        path: filesystem path to the video file.

    Returns:
        dict with keys ``codec``, ``profile``, ``width``, ``height`` (strings
        from ffmpeg output), ``hours``, ``minutes``, ``seconds`` (Decimal),
        and ``duration`` (total seconds, Decimal).

    Raises:
        ValueError: if ffmpeg's output does not contain a parsable
            Duration / Video-stream line (bad path, no video stream).
    """
    from decimal import Decimal
    import subprocess
    import re
    # ffmpeg prints stream info on stderr; merge it into stdout so a single
    # decode covers everything (the old code decoded the bytes twice).
    process = subprocess.Popen(['/usr/bin/ffmpeg', '-i', path],
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, _ = process.communicate()
    output = stdout.decode("UTF-8")
    duration_match = re.search(
        r"Duration:\s{1}(?P<hours>\d+?):(?P<minutes>\d+?):(?P<seconds>\d+\.\d+?)",
        output, re.DOTALL)
    info_match = re.search(
        r": Video: (?P<codec>.*?), (?P<profile>.*?), (?P<width>.*?)x(?P<height>.*?), ",
        output, re.DOTALL)
    if duration_match is None or info_match is None:
        # Previously this crashed with AttributeError on .groupdict();
        # fail with a message that points at the real cause instead.
        raise ValueError("could not parse ffmpeg output for %r" % path)
    duration_m = duration_match.groupdict()
    info_m = info_match.groupdict()
    hours = Decimal(duration_m['hours'])
    minutes = Decimal(duration_m['minutes'])
    seconds = Decimal(duration_m['seconds'])
    total = 0
    total += 60 * 60 * hours
    total += 60 * minutes
    total += seconds
    info_m['hours'] = hours
    info_m['minutes'] = minutes
    info_m['seconds'] = seconds
    info_m['duration'] = total
    return info_m
def prettify(string):
    """Turn arbitrary text into a URL-friendly slug.

    Lowercases, converts whitespace/hyphen runs to single hyphens, and strips
    every character outside ``[a-z0-9-]``.

    Parameters:
        string: raw text (e.g. a title).

    Returns:
        str: slugified text, e.g. ``"Hello World"`` -> ``"hello-world"``.
    """
    string = string.lower()
    string = string.replace('-', ' ')
    string = string.strip()
    string = string.replace(' ', '-')
    # Raw strings: '\-' in a plain literal is an invalid escape sequence and
    # raises a SyntaxWarning/DeprecationWarning on modern Python.
    string = re.sub(r'[^A-Za-z0-9\-]+', '', string)
    string = re.sub(r'-+', '-', string)  # collapse hyphen runs
    return string
def pre_process(text):
    """Normalize raw text for tokenization.

    Lowercases, strips HTML-like tags and comments, and replaces every run of
    digits or non-word characters with a single space.

    Parameters:
        text: raw text, possibly containing markup.

    Returns:
        str: lowercased text containing only word characters separated by
        single spaces (may have leading/trailing spaces).
    """
    text = text.lower()
    text = re.sub(r'<.*?>', ' ', text)          # remove tags
    text = re.sub(r"<!--?.*?-->", " ", text)    # remove comments
    # The final sub collapses all digit/non-word runs (including any double
    # spaces produced above) into a single space, so the old
    # ``.replace(' ', ' ')`` calls — which replaced a space with a space —
    # were no-ops and have been removed.
    text = re.sub(r"(\d|\W)+", " ", text)
    return text
def clean_user_data(text):
    """Normalize, tokenize, and stopword-filter user-supplied text.

    Parameters:
        text: raw user text.

    Returns:
        list[str]: tokens from the pre-processed text with English
        stopwords removed.
    """
    tokens = word_tokenize(pre_process(text.lower()))
    return [tok for tok in tokens if tok not in sw]
def get_similar_questions(user_ques, question):
    """Score how similar a cleaned query is to a candidate question.

    Builds binary occurrence vectors over the combined token list and
    returns their cosine similarity.

    Parameters:
        user_ques: list of already-cleaned tokens (e.g. the output of
            ``clean_user_data``).
        question: raw question text; it is pre-processed, tokenized, and
            stopword-filtered here before comparison.

    Returns:
        float in [0, 1]; 0.0 when either side has no tokens (the previous
        sklearn-based version warned and returned 0 implicitly for an
        all-zero vector).
    """
    question_tokens = word_tokenize(pre_process(question))
    question_tokens = [w for w in question_tokens if w not in sw]
    vocab = user_ques + question_tokens
    # Binary occurrence vectors over the combined token list (the old code
    # also initialized `total = []` only to overwrite it immediately).
    v1 = [1 if w in user_ques else 0 for w in vocab]
    v2 = [1 if w in question_tokens else 0 for w in vocab]
    # For two vectors a plain dot product beats importing sklearn's pairwise
    # machinery; entries are 0/1, so sum(v) equals the sum of squares.
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = sum(v1) ** 0.5
    norm2 = sum(v2) ** 0.5
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)