-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathlearntopbigrams.jl
More file actions
75 lines (66 loc) · 2.24 KB
/
learntopbigrams.jl
File metadata and controls
75 lines (66 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
include("learnlib.jl")
"""
    run_country_gda(trainXc, trainTc)

Fit a `LinearDiscriminantAnalysis` model on `trainXc` with labels `trainTc`
and return the fitted model.

The model is constructed with ten class priors, each equal to 0.1.
"""
function run_country_gda(trainXc, trainTc)
    println("Training country gda")
    uniformpriors = fill(0.1, 10)
    classifier = LinearDiscriminantAnalysis(priors=uniformpriors)
    fit!(classifier, trainXc, trainTc)
    return classifier
end
"""
    run_rank_gda(trainX, trainY)

Fit a `LinearDiscriminantAnalysis` model on `trainX`, using
`ratingstoranks(trainY)` as the labels, and return the fitted model.

The model is constructed with ten class priors, each equal to 0.1.
"""
function run_rank_gda(trainX, trainY)
    println("Training rank gda")
    uniformpriors = fill(0.1, 10)
    classifier = LinearDiscriminantAnalysis(priors=uniformpriors)
    # Labels are derived after the model is built, as in the original ordering.
    ranklabels = ratingstoranks(trainY)
    fit!(classifier, trainX, ranklabels)
    return classifier
end
contestid = 1023
println("Loading data ", contestid)
objs = JLD.load(@sprintf("cs229_project/data2-tokens-big-2gram-cv-%s.jld", contestid))

# Unpack the stored entries: train split first, then test split, then vocab.
trainX, trainXv1, trainY, trainT =
    map(key -> objs[key], ("trainX", "trainXv1", "trainY", "trainT"))
testX, testXv1, testY, testT =
    map(key -> objs[key], ("testX", "testXv1", "testY", "testT"))
vocab = objs["vocab"]

weights = getweights(trainY)
weights2 = getweights2(trainY)

# Country subsets: keep only the rows whose entry in trainT / testT is not -1.
# The mask lives in a `let` so it does not become an extra global.
trainXc, trainTc = let keep = trainT .!= -1
    (trainX[keep, :], trainT[keep])
end
testXc, testTc = let keep = testT .!= -1
    (testX[keep, :], testT[keep])
end
weightscountry = getweightscountry(trainTc)

# Drop the reference to the loaded container now that everything is unpacked.
objs = nothing

@printf("Main set: %d train %d test\n", size(trainX, 1), size(testX, 1))
@printf("Country set: %d train %d test\n", size(trainXc, 1), size(testXc, 1))
#rank_tf = JLD.load(@sprintf("cs229_project/data2-tokens-big-2gram-cv-%s-tf-rank.jld", contestid))
#
#diff = rank_tf["tf_logit"]["W"][:,2]# - rank_tf["tf_logit"]["W"][:,9]
#diff = [(diff, vocab[idx], idx) for (idx, diff) in enumerate(diff)]
#sort!(diff, rev=true)
#println(diff[1:20])
#
#diff = rank_tf["tf_logit"]["W"][:,9]# - rank_tf["tf_logit"]["W"][:,2]
#diff = [(diff, vocab[idx], idx) for (idx, diff) in enumerate(diff)]
#sort!(diff, rev=true)
#println(diff[1:20])
#rank_gda = run_rank_gda(trainX, trainY)
country_gda = run_country_gda(trainXc, trainTc)
n = size(trainX, 2)  # number of columns of trainX; not referenced below

# Read the fitted model's `means_` attribute once rather than on every loop
# iteration; the value does not change between iterations.
classmeans = country_gda[:means_]
topk = 20  # how many of the highest-scoring vocabulary entries to print

# For each of the 10 classes, pair every entry of that class's row of means with
# its vocabulary entry and column index, sort descending, and print the leaders.
for i in 1:10
    # `score` replaces the original `diff`, which shadowed `Base.diff` and was
    # shadowed again inside its own comprehension.
    ranked = [(score, vocab[idx], idx) for (idx, score) in enumerate(classmeans[i, :])]
    sort!(ranked, rev=true)
    println(COUNTRIES[i])
    # Clamp the slice so having fewer than `topk` entries cannot raise BoundsError.
    println(ranked[1:min(topk, length(ranked))])
    println()
end
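# NOTE: this line originally subtracted row 2 from itself (always zero); row 9
# is assumed here by symmetry with the [9,:] - [2,:] comparison below.
#diff = rank_gda[:means_][2,:] - rank_gda[:means_][9,:]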
#diff = [(diff, vocab[idx], idx) for (idx, diff) in enumerate(diff)]
#sort!(diff, rev=true)
#println(diff[1:20])
#
#diff = rank_gda[:means_][9,:] - rank_gda[:means_][2,:]
#diff = [(diff, vocab[idx], idx) for (idx, diff) in enumerate(diff)]
#sort!(diff, rev=true)
#println(diff[1:20])