-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmagnitude_by_doc.py
More file actions
32 lines (26 loc) · 1.1 KB
/
magnitude_by_doc.py
File metadata and controls
32 lines (26 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# coding=utf-8
from mrjob.job import MRJob
from mrjob.step import MRStep
import redis
# Stores the magnitude for each document.
class MRMagnitudeByDoc(MRJob):
    """MapReduce job that stores each document's magnitude in Redis.

    Every input line is expected to look like
    ``<document name>;;<magnitude>``. The mapper splits the line into its
    two fields and the reducer writes the magnitude to the Redis key
    ``magnitude:<document name>``.

    NOTE(review): the reducer uses the module-level name ``r``, which this
    file only binds when it is executed as a script (inside the
    ``if __name__ == '__main__':`` block). Importing this class from another
    module and running it there would raise ``NameError`` in the reducer.
    """

    def steps(self):
        """Return the single map/reduce step that makes up this job."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]

    def mapper(self, _, line):
        """Yield ``(document name, magnitude)`` parsed from one input line.

        The magnitude is passed on exactly as it appears in the line (a
        string); no numeric conversion happens here.

        Blank or whitespace-only lines are skipped. Any other line must
        contain the ``;;`` separator exactly once, otherwise the unpacking
        below raises ``ValueError``.
        """
        # A blank line (for example a trailing empty line in the input file)
        # carries no record; skip it instead of failing the whole task.
        if not line.strip():
            return
        doc_name, magnitude = line.split(";;")
        yield doc_name, magnitude

    def reducer(self, doc_name, magnitudes):
        """Store every magnitude received for ``doc_name`` in Redis.

        Nothing is yielded. When several magnitudes arrive for the same
        document, each one overwrites the previous value under the same key,
        so the last one iterated is the one that remains stored.
        """
        for magnitude in magnitudes:
            # Stores the magnitude of the document.
            r.set("magnitude:" + doc_name, magnitude)
            # Echo the stored pair to standard output.
            print(doc_name, magnitude)
if __name__ == '__main__':
    # Imported here because only this script entry point needs it.
    import os

    # SECURITY NOTE(review): the Redis connection settings, including the
    # password, are hard-coded in this file. They are kept below only as
    # fallbacks so the job behaves exactly as before when no environment
    # variables are set. The password should be rotated and the literal
    # removed once every environment that runs this job provides
    # GUTENBERG_REDIS_PASSWORD.
    #
    # `r` is deliberately a module-level name: the reducer of
    # MRMagnitudeByDoc looks it up as a global.
    r = redis.StrictRedis(
        host=os.environ.get('GUTENBERG_REDIS_HOST',
                            'gutenberg-ir.redis.cache.windows.net'),
        port=int(os.environ.get('GUTENBERG_REDIS_PORT', 6380)),
        db=int(os.environ.get('GUTENBERG_REDIS_DB', 1)),
        password=os.environ.get('GUTENBERG_REDIS_PASSWORD',
                                'B4qWA879R/U2ldA3mWT5kcJSHrDXOijbd9ju+89PNhg='),
        ssl=True)
    MRMagnitudeByDoc.run()