-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathold_ocr_stats.py
More file actions
140 lines (125 loc) · 5.57 KB
/
old_ocr_stats.py
File metadata and controls
140 lines (125 loc) · 5.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
'''Author: Chandra Krintz, UCSB, ckrintz@cs.ucsb.edu, AppScale BSD license'''
'''USAGE:'''
'''grep 'NewOCR\|OrigOCR\|process_JPG\|perform_OCR' old_ocr_main.out > grot'''
'''python2.7 old_ocr_stats.py [--debug] [--newocr] grot <img_dir_prefix> > out'''
'''old_ocr_main.out are the Main entries from runocr.sh/jpeg_processor.py runocr.out output file'''
import json, sys, argparse, csv, logging, os, time
import exifread
import numpy as np
#Debug switch: when true, process() prints each reversed record before parsing it.
#main() overwrites this default with the value of the --debug command-line flag.
DEBUG = True
########################
def process(img_dir_prefix,lst,ocr_list,newocr):
    '''Parse one OCR log record and print the renamed photo with its temperature.

    A record is the run of log lines that ends with a perform_OCR line (see the
    sample lines in the comments below).  The image named on the record's
    process_JPG line is opened to read its EXIF 'Image DateTime', and one line
    'Main_<date>_<time>_<id>.JPG <temp>' is printed.  ' ERR' is appended when
    the temperature is more than 10 outside the (min, max) pair stored for the
    photo's month, or when the month cannot be determined from the EXIF date.

    Args:
      img_dir_prefix: currently unused; the path found in the log is opened
        as-is.  NOTE(review): presumably meant to be joined onto that path -
        confirm before wiring it in.
      lst: the record's lines in file order; the last one is the perform_OCR line.
      ocr_list: list to which the perform_OCR duration (float seconds) is appended.
      newocr: True when the process_JPG line is expected directly before the
        temperature line (NewOCR layout), False when one extra line sits in
        between (OrigOCR layout).

    Exits with status 1, after printing a message, when the record does not
    end with a perform_OCR line, contains no process_JPG line, or the image has
    no EXIF tags / no 'Image DateTime' tag.
    '''
    #...
    #process_JPG (../photos2/2016 Sedgwick Pictures (8-24-2016 Monthly Update)/Main Road Water Hole (6-27-2016 to 8-10-2016)/Main Road Water Hole 6-27-2016 to 7-10-2016/100RECNX/IMG_9305.JPG) : 0.0000159740 secs
    #process_jpeg: skipping upload for testing purposes
    #OrigOCR: Temp is: 55
    #perform_OCR : 0.3743669987 secs
    #...
    #process_JPG (../photos2/2016 Sedgwick Pictures (8-24-2016 Monthly Update)/Main Road Water Hole (6-27-2016 to 8-10-2016)/Main Road Water Hole 6-27-2016 to 7-10-2016/100RECNX/IMG_7941.JPG) : 0.0000169277 secs
    #NewOCR: Temp is: 74
    #perform_OCR : 0.3260560036 secs

    #sedgwick data across 2013 through 2016 per month from: http://www.wrcc.dri.edu/cgi-bin/rawMAIN.pl?caucse
    #12 months Jan-Dec, (min, max) across years
    #2013
    #min_max = [(25,77), (29,79), (35,86), (30,91), (33,98), (61,103), (44,100), (42,99), (39,98), (38,90), (39,85), (22,83)]
    #2013 & 2014
    #min_max = [(25,86), (29,83), (35,88), (30,95), (33,99), (40,103), (44,100), (42,99), (39,99), (38,101), (37,89), (22,83)]
    #2013 & 2014 & 2015
    #min_max = [(25,86), (29,83), (35,91), (30,95), (33,99), (40,103), (44,100), (42,102), (39,101), (38,101), (31,89), (22,83)]
    #2013 & 2014 & 2015 & 2016
    min_max = [(25,86), (28,87), (31,91), (30,95), (33,99), (40,103), (43,100), (42,102), (39,101), (38,101), (31,89), (22,83)]

    #work backwards from the perform_OCR line that closes the record
    rlist = list(reversed(lst))
    #single-argument print(...) behaves the same under python 2.7 and python 3
    if DEBUG:
        print('rlist: {0}'.format(rlist))

    #verify rlist[0] startsWith 'perform_OCR' (an empty record fails the same way)
    if not rlist or not rlist[0].startswith('perform_OCR'):
        print('error, ele should be perform_OCR line: {0}'.format(rlist[0] if rlist else ''))
        sys.exit(1)
    ocr_time = float(rlist[0].split()[2]) #perform_OCR : 0.3369410038 secs, extract secs

    #'OrigOCR: Temp is: 55' -> the value is the 4th token.  -9999 marks a
    #missing or non-numeric reading; it is always far below the monthly
    #minimum, so such a record is reported with ERR instead of crashing.
    temp = -9999
    try:
        temp = int(rlist[1].split()[3])
    except (ValueError, IndexError):
        pass

    jpg_line = _find_jpg_line(rlist,newocr)
    if jpg_line is None:
        print('error, no process_JPG line in record: {0}'.format(rlist))
        sys.exit(1)
    #process_JPG (fname) : val secs
    #split on the LAST colon only so a colon inside the path does not truncate
    #it, then drop the 13-char 'process_JPG (' prefix and the trailing ') '
    ftmp = jpg_line.rsplit(':',1)[0][13:-2]
    ocr_list.append(ocr_time)

    with open(ftmp, 'rb') as fjpeg:
        tags = exifread.process_file(fjpeg)
    if not tags:
        print('Error: tags is None!')
        sys.exit(1)
    stop_tag = 'Image DateTime'
    if stop_tag not in tags:
        print('Error: no {0} tag in {1}'.format(stop_tag, ftmp))
        sys.exit(1)
    dt_tag = tags[stop_tag].printable
    #dt_tag: 2014:08:01 19:06:50
    d = (dt_tag.split()[0]).replace(':','-')
    t = dt_tag.split()[1]

    #min_max has base_index of 0 not 1 so subtract 1
    try:
        mo_idx = int(d[5:7])-1
    except ValueError:
        mo_idx = -1
    if 0 <= mo_idx < len(min_max):
        mi, mx = min_max[mo_idx]
        err = temp < (mi-10) or temp > (mx+10)
    else:
        #a month outside 1-12 (e.g. an unset camera clock giving month 00) must
        #not wrap around to December's bounds via a negative index; the
        #reading cannot be validated, so flag it
        err = True

    #get just the filename without the path
    orig_fname = ftmp[ftmp.rfind('/')+1:]
    photo_id = _photo_id(orig_fname)
    newfname = 'Main_{0}_{1}_{2}.JPG'.format(d,t,photo_id)
    if err:
        print('{0} {1} ERR'.format(newfname, temp))
    else:
        print('{0} {1}'.format(newfname, temp))


def _find_jpg_line(rlist,newocr):
    '''Return the process_JPG line of a reversed record, or None if there is none.'''
    #expected position: right after the temperature line for NewOCR records,
    #one line further for OrigOCR records (which carry an extra line in between)
    expected = 2 if newocr else 3
    if expected < len(rlist) and rlist[expected].startswith('process_JPG'):
        return rlist[expected]
    #layout differs from what the flag implies: take the process_JPG line
    #nearest to the closing perform_OCR line instead of indexing blindly
    for line in rlist[1:]:
        if line.startswith('process_JPG'):
            return line
    return None


def _photo_id(orig_fname):
    '''Return the photo id embedded in a camera file name, without the 4-char extension.'''
    if orig_fname.startswith('IMAG'):
        #NOTE(review): the id starts at offset 5 for IMAG files, i.e. one
        #character after the 4-letter prefix is skipped - confirm intended
        idx = 4
    elif orig_fname.startswith(('IMG_','RCNX','MFDC')):
        idx = 3
    else:
        #xxx 500.JPG -> id follows the last space; with no space rfind gives
        #-1 and the whole stem is used instead of raising ValueError
        idx = orig_fname.rfind(' ')
    return orig_fname[idx+1:len(orig_fname)-4]
######################## main ############################
def main():
    '''Read a filtered OCR log, report every photo, then print timing statistics.

    Command line: fn img_dir_prefix [--debug] [--newocr]

    The log named by fn is read line by line.  Lines are buffered until one
    starts with 'perform_OCR'; the buffered record is then handed to process()
    and the buffer is cleared.  Lines after the last perform_OCR line are
    ignored.  Finally the mean, the sample standard deviation (ddof=1) and the
    number of collected perform_OCR durations are printed.

    Side effect: sets the module-level DEBUG flag from --debug.
    '''
    global DEBUG
    logging.basicConfig()
    parser = argparse.ArgumentParser(description='Process stdout from DEBUG=True check_box.py')
    parser.add_argument('fn',action='store',help='filename')
    parser.add_argument('img_dir_prefix',action='store',help='prefix path to the start of ../photos')
    parser.add_argument('--debug',action='store_true',default=False,help='Turn debugging on (default: off)')
    parser.add_argument('--newocr',action='store_true',default=False,help='Parse for newocr file')
    args = parser.parse_args()
    DEBUG = args.debug

    #perform_OCR durations in seconds, filled in by process()
    ocr_list = []

    #read in the file (produced with the grep shown in the USAGE notes at the top of this file)
    with open(args.fn,'r') as f:
        #record until we hit 'perform_OCR' then back up through recorded list
        recorded_list = []
        for line in f:
            recorded_list.append(line)
            if line.startswith('perform_OCR'):
                #process recorded list
                process(args.img_dir_prefix,recorded_list,ocr_list,args.newocr)
                recorded_list = []

    #convert the list to an array
    ocr_ary = np.asarray(ocr_list,dtype=np.float64)
    count = len(ocr_ary)

    #std = sqrt(mean(abs(x - x.mean())**2))
    #count = N-1 (unbiased) when ddof=1 (N when ddof is unset):
    #np.std(a,ddof=1,dtype=np.float64)
    #The mean needs at least one sample and the ddof=1 stdev at least two;
    #numpy returns nan for those cases but also emits RuntimeWarnings, so the
    #same nan is produced here explicitly.
    nan = float('nan')
    avg = np.mean(ocr_ary,dtype=np.float64) if count > 0 else nan
    stdev = np.std(ocr_ary,ddof=1,dtype=np.float64) if count > 1 else nan
    #single-argument print(...) behaves the same under python 2.7 and python 3
    print('OCR avg: {0} stdev: {1}, count {2}'.format(avg, stdev, count))
##################################
#Run the report only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()