forked from simon-weber/Programming-Language-Identification
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathidentifylanguage.py
More file actions
executable file
·132 lines (112 loc) · 4.08 KB
/
identifylanguage.py
File metadata and controls
executable file
·132 lines (112 loc) · 4.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#Copyright (c) 2011 David Klein and Simon Weber
#Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
import sys
from identifytraits import *
import commentIdentify
#gets a list of all the languages known so far
def getLanguages():
    """Return the language names listed in 'languagesknown.txt'.

    The file is read from the current working directory, one language per
    line. Surrounding whitespace is stripped and blank lines are skipped.

    If the file cannot be opened for reading, an empty 'languagesknown.txt'
    is created and an empty list is returned.
    """
    try:
        # Read-only is enough here; the original 'r+' also demanded write
        # permission for no benefit.
        language_file = open('languagesknown.txt', 'r')
    except IOError:
        # Only I/O failures are expected; a bare except would also hide
        # KeyboardInterrupt and programming errors.
        open('languagesknown.txt', 'w').close()
        return []
    # The with-block closes the file even if reading raises.
    with language_file:
        return [line.strip() for line in language_file if line.strip()]
#takes all the individual scores and turns them into a final guess
def combineScores(list_of_scores, languages, showIndividualScores=False):
    """Print the (up to) five best-scoring languages to standard output.

    list_of_scores maps a trait name to a dict of {language: score}. Every
    score is multiplied by 100 and added to that language's running total;
    languages that do not appear in `languages` are left out of the ranking.
    The ranking lists the highest totals first (ties are ordered by language
    name, descending) and prints each total multiplied by 100 again and
    truncated to an int.

    When showIndividualScores is true, each trait's header and per-language
    percentage lines are printed before the ranking.

    Returns None.
    """
    # Dict lookup instead of scanning the whole tally for every score.
    totals = dict.fromkeys(languages, 0)
    for trait, scores in list_of_scores.items():
        if showIndividualScores:
            # Parenthesised single-argument print is valid as both the
            # Python 2 statement and the Python 3 function.
            print("________" + str(trait) + "________")
        for lang, score in scores.items():
            if showIndividualScores:
                label = str(lang)
                # Pad the label to a 10-character column (no padding when the
                # name is longer: a negative repeat count yields '').
                print(label + ":" + " " * (10 - len(label))
                      + str(int(score * 100)) + "%")
            if lang in totals:
                totals[lang] += score * 100

    # One [total, language] entry per listed language, so a language listed
    # twice still appears twice in the ranking, as before.
    finalTally = sorted([totals[lang], lang] for lang in languages)
    for rank, (total, lang) in enumerate(reversed(finalTally[-5:]), 1):
        print(str(rank) + ". " + str(lang) + " - " + str(int(total * 100)))
def stripCommentsAndStrings(source):
    """Remove guessed comment and string spans from a list of source lines.

    `source` is a list of line strings. It is modified in place and also
    returned. The tokens come from commentIdentify.guessTokens(source), whose
    result is indexed as:
      result[0] - (start, end) token pairs (presumably block comments),
      result[1] - tokens that cut off the rest of a line (presumably
                  line comments),
      result[2] - delimiter tokens (presumably string quotes).

    Three passes are made over the lines:
      1. A line whose first space-separated word equals a start token opens a
         block: that line and the following ones are blanked until a line
         containing the matching end token is found. If no line contains it,
         everything to the end of the input is blanked.
      2. Each line is truncated at the first occurrence of any result[1]
         token.
      3. For every result[2] token, the text from its first occurrence up to
         the first character of its last occurrence on the line is removed,
         repeatedly, until the token no longer occurs.
    """
    result = commentIdentify.guessTokens(source)

    # Pass 1: multi-line blocks.
    blockTokens = {}
    for start, end in result[0]:
        blockTokens[start] = end
    for lineNo in range(len(source)):
        # Re-read the line each time: earlier iterations may have blanked it.
        firstWord = source[lineNo].strip().split(" ")[0]
        if firstWord not in blockTokens:
            continue
        endToken = blockTokens[firstWord]
        # Bounded scan: the original indexed one past the last line and
        # raised IndexError when the block was never closed.
        for i in range(lineNo, len(source)):
            loc = source[i].find(endToken)
            if loc != -1:
                # NOTE(review): the end token itself is kept on the line, as
                # in the original code - confirm this is intended.
                source[i] = source[i][loc:]
                break
            source[i] = ""

    # Pass 2: cut every line at the first line-comment token it contains.
    lineTokens = list(result[1])
    for lineNo in range(len(source)):
        for tok in lineTokens:
            loc = source[lineNo].find(tok)
            if loc != -1:
                source[lineNo] = source[lineNo][:loc]

    # Pass 3: delete delimited spans. Empty tokens are skipped because
    # ''.find('') is always 0, which would make the while loop spin forever.
    stringTokens = [tok for tok in result[2] if tok]
    for lineNo in range(len(source)):
        for tok in stringTokens:
            startLoc = source[lineNo].find(tok)
            while startLoc != -1:
                endLoc = source[lineNo].rfind(tok)
                # NOTE(review): only one character of the closing token is
                # dropped (endLoc + 1), matching the original; multi-character
                # tokens leave their tail behind - confirm.
                source[lineNo] = (source[lineNo][:startLoc]
                                  + source[lineNo][endLoc + 1:])
                startLoc = source[lineNo].find(tok)
    return source
def main():
    """Score the source code read from stdin and print the language ranking.

    The comment/string trait is scored on the raw input; the remaining traits
    are scored after stripCommentsAndStrings has processed the lines. Passing
    "-v" as the only command-line argument also prints each trait's scores.
    """
    languages = getLanguages()
    source = sys.stdin.readlines()

    list_of_scores = {}
    # Must run before stripping, since it scores the comments and strings.
    list_of_scores["commentsAndStrings"] = identifyCommentAndString(languages, source)

    source = stripCommentsAndStrings(source)
    strippedTraits = (
        ("lastCharacter", identifyLastCharacter),
        ("firstWord", identifyFirstWord),
        ("operator", identifyOperator),
        ("brackets", identifyBrackets),
        ("keywords", identifyKeywords),
        ("punctuation", identifyPunctuation),
    )
    for traitName, identify in strippedTraits:
        list_of_scores[traitName] = identify(languages, source)

    verbose = len(sys.argv) == 2 and sys.argv[1] == "-v"
    combineScores(list_of_scores, languages, verbose)
# Run only when executed as a script, so that importing this module does not
# read stdin or print a ranking as a side effect.
if __name__ == "__main__":
    main()