forked from UniversalDependencies/tools
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconllu-stats.py
More file actions
executable file
·140 lines (126 loc) · 6.83 KB
/
conllu-stats.py
File metadata and controls
executable file
·140 lines (126 loc) · 6.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import sys
import re
import file_util
from file_util import FORM,LEMMA,CPOSTAG,FEATS,DEPREL,DEPS #column index for the columns we'll need
import argparse
import os
import codecs
import json
import traceback
THISDIR=os.path.dirname(os.path.abspath(__file__))
class Stats(object):
    """Accumulate basic statistics over the token lines of CoNLL-U trees.

    Feed every token line (already split into columns) to count_cols(),
    bump tree_count once per tree, and then use the print_* / get_stats
    methods to report the totals.

    The reporting methods write with ``out.write`` and iterate the
    counters directly, so the class itself runs unchanged on Python 2
    and Python 3.
    """

    def __init__(self):
        self.token_count = 0
        self.word_count = 0
        self.tree_count = 0  # incremented by the caller, once per tree
        self.words_with_lemma_count = 0
        self.words_with_deps_count = 0
        self.f_val_counter = {}  # key: "feature=value" (POS tags as "CPOSTAG=tag"), value: count
        self.deprel_counter = {}  # key: deprel, value: count

    def count_cols(self, cols):
        """Update all counters from one token line.

        cols is the list of column strings of the line; cols[0] is the ID
        and the remaining columns are addressed through the index
        constants imported from file_util. The input is assumed to be
        validated: a malformed ID, feature or DEPS entry raises ValueError.
        """
        token_id = cols[0]
        if token_id.isdigit() or u"." in token_id:  # word or empty word
            self.word_count += 1
            self.token_count += 1  # every word is also a one-word token
        else:  # multiword token range "b-e"
            first, last = token_id.split(u"-")
            # Every word in the range is (or will be) counted as a token of
            # its own, so subtract all but one of them to offset for that.
            self.token_count -= int(last) - int(first)
        # A "_" lemma still counts as a lemma when the form itself is "_".
        if cols[LEMMA] != u"_" or cols[FORM] == u"_":
            self.words_with_lemma_count += 1
        if cols[CPOSTAG] != u"_":
            pos_key = u"CPOSTAG=" + cols[CPOSTAG]
            self.f_val_counter[pos_key] = self.f_val_counter.get(pos_key, 0) + 1
        if cols[FEATS] != u"_":
            for cat_is_vals in cols[FEATS].split(u"|"):
                cat, vals = cat_is_vals.split(u"=", 1)
                # A feature may carry several comma-separated values;
                # each one is counted as its own feature=value pair.
                for val in vals.split(u","):
                    pair = cat + u"=" + val
                    self.f_val_counter[pair] = self.f_val_counter.get(pair, 0) + 1
        if cols[DEPREL] != u"_":
            self.deprel_counter[cols[DEPREL]] = self.deprel_counter.get(cols[DEPREL], 0) + 1
        if cols[DEPS] != u"_":
            self.words_with_deps_count += 1
            # Relations from the DEPS column go into the same counter as
            # the ones from the DEPREL column.
            for head_and_deprel in cols[DEPS].split(u"|"):
                _head, deprel = head_and_deprel.split(u":", 1)
                self.deprel_counter[deprel] = self.deprel_counter.get(deprel, 0) + 1

    def _langspec_deprel_count(self):
        """Return the number of distinct relations that contain a ":" subtype."""
        return sum(1 for deprel in self.deprel_counter if u":" in deprel)

    def _postag_count(self):
        """Return the number of distinct POS tags seen."""
        return sum(1 for key in self.f_val_counter if key.startswith(u"CPOSTAG="))

    def _catval_count(self):
        """Return the number of distinct feature=value pairs, POS tags excluded."""
        return sum(1 for key in self.f_val_counter if not key.startswith(u"CPOSTAG="))

    @staticmethod
    def _sort_key(sort):
        """Return the sorted() key for (name, count) items.

        "freq" orders by descending count, "alph" by lower-cased name.
        Any other value is reported on stderr and terminates the program
        with exit status 1.
        """
        if sort == "freq":
            return lambda item: -item[1]
        if sort == "alph":
            return lambda item: item[0].lower()
        sys.stderr.write("Unknown sort order: %s. Use --sort=freq or --sort=alph.\n" % sort)
        sys.exit(1)

    def print_basic_stats(self, out):
        """Write a human-readable summary of the counters to out."""
        out.write("Tree count:  %s\n" % self.tree_count)
        out.write("Word count:  %s\n" % self.word_count)
        out.write("Token count: %s\n" % self.token_count)
        out.write("Dep. relations: %d of which %d language specific\n"
                  % (len(self.deprel_counter), self._langspec_deprel_count()))
        out.write("POS tags: %s\n" % self._postag_count())
        out.write("Category=value feature pairs: %s\n" % self._catval_count())

    def get_stats(self):
        """Returns a dictionary of elementary stats"""
        # Relations that differ only in their ":" subtype share one base relation.
        ud_rels = len(set(deprel.split(u":")[0] for deprel in self.deprel_counter))
        return {
            "tree_count": self.tree_count,
            "word_count": self.word_count,
            "token_count": self.token_count,
            "deprels": len(self.deprel_counter),
            "langspec_deprels": self._langspec_deprel_count(),
            "universal_deprels": ud_rels,
            "postags": self._postag_count(),
            "catvals": self._catval_count(),
            "words_with_lemma_count": self.words_with_lemma_count,
            "words_with_deps_count": self.words_with_deps_count,
        }

    def print_deprels(self, out, which=u"UD+langspec", sort="freq"):
        """Write the dependency relations to out, one per line.

        which selects the relations by substring: "UD" enables relations
        without ":", "langspec" enables relations with ":", and
        "UD+langspec" enables both. sort is "freq" or "alph"; any other
        value exits the program with status 1.
        """
        key = self._sort_key(sort)
        for deprel, _count in sorted(self.deprel_counter.items(), key=key):
            is_langspec = u":" in deprel
            if (is_langspec and u"langspec" in which) or (not is_langspec and u"UD" in which):
                out.write(deprel + "\n")

    def print_features(self, out, which=u"UD+langspec", sort="freq"):
        """Write the feature=value pairs (POS tags excluded) to out, one per line.

        which and sort work as in print_deprels, with the UD/langspec
        split decided by the feature name rather than the value.
        """
        # Documentation and listing of feature-value pairs has changed after
        # release 2.7. The file data/feats.ud no longer exists, so it is no
        # longer read here (it used to contribute every non-empty line not
        # starting with "#" to ud_cats) and the script does not crash.
        ###!!! This means that all feature-value pairs are now treated as language-specific in the stats!
        ud_cats = set()
        key = self._sort_key(sort)
        for cat_is_val, _count in sorted(self.f_val_counter.items(), key=key):
            cat = cat_is_val.split(u"=", 1)[0]
            if cat == u"CPOSTAG":
                continue
            is_ud = cat in ud_cats
            if (is_ud and u"UD" in which) or (not is_ud and u"langspec" in which):
                out.write(cat_is_val + "\n")
def main():
    """Command-line entry point: read trees, collect stats, print what was asked for.

    Errors raised while reading or counting are reported on stderr and the
    statistics gathered up to that point are still printed.
    """
    opt_parser = argparse.ArgumentParser(description='Script for basic stats generation. Assumes a validated input.')
    # NOTE(review): the help text says "or nothing", but nargs='+' requires
    # at least one input argument.
    opt_parser.add_argument('input', nargs='+', help='Input file name (can be several files), or "-" or nothing for standard input.')
    opt_parser.add_argument('--stats', action='store_true', default=False, help='Print basic stats')
    opt_parser.add_argument('--jsonstats', action='store_true', default=False, help='Print basic stats as json dictionary')
    opt_parser.add_argument('--deprels', default=None, help='Print deprels. The option can be "UD", "langspec", or "UD+langspec".')
    opt_parser.add_argument('--catvals', default=None, help='Print category=value pairs. The option can be "UD", "langspec", or "UD+langspec". This distinction is based on the feature, not the value. WARNING: THIS DISTINCTION CURRENTLY DOES NOT WORK; USE UD+langspec.')
    opt_parser.add_argument('--sort', default='freq', help='Sort the values by their frequency (freq) or alphabetically (alph). Default: %(default)s.')
    args = opt_parser.parse_args()  # Parsed command-line arguments
    # There is no output option on the command line; the value is set here
    # for file_util.in_out (presumably "-" selects standard output -- confirm
    # against file_util).
    args.output = "-"
    inp, out = file_util.in_out(args, multiple_files=True)
    trees = file_util.trees(inp)

    stats = Stats()
    try:
        for _comments, tree in trees:
            stats.tree_count += 1
            for cols in tree:
                stats.count_cols(cols)
    except Exception:
        # Best effort: report the problem and still print what was counted.
        # KeyboardInterrupt and SystemExit are deliberately not caught here,
        # so an interrupted run stops instead of printing partial stats.
        traceback.print_exc()
        sys.stderr.write("\n\n ------- STATS MAY BE EMPTY OR INCOMPLETE ----------\n")

    if args.stats:
        stats.print_basic_stats(out)
    if args.jsonstats:
        # The JSON dictionary goes straight to standard output, not to out.
        sys.stdout.write(json.dumps(stats.get_stats()) + "\n")
    if args.deprels:
        stats.print_deprels(out, args.deprels, args.sort)
    if args.catvals:
        stats.print_features(out, args.catvals, args.sort)


if __name__ == "__main__":
    main()