Skip to content

Latest commit

ย 

History

History
114 lines (101 loc) · 4.67 KB

File metadata and controls

114 lines (101 loc) · 4.67 KB

Tiny Elephant(한국어)

메모리 기반의 collaborative filtering. 하용호님의 이 슬라이드를 참고하여 구현하였습니다. 본 구현은 파이썬으로 구현되어 있습니다.

준비물

Snappy 설치

OSX

brew install snappy

Ubuntu

apt install libsnappy-dev

설치

pip install tiny-elephant

사용법

프로젝트를 클론하시고 다음 코드를 실행해보세요.

from collections import Counter
from in_memory_cluster import InMemoryCluster

# ๋ฐ์ดํ„ฐ๋Š” ๋‹ค์Œ๊ณผ ๊ฐ™์ด ๋”•์…”๋„ˆ๋ฆฌ๋กœ ๊ตฌ์„ฑํ•ด์ค๋‹ˆ๋‹ค.
data = {
    "user1": ['airplane', 'banana', 'cat', 'dog', 'elephant', 'fruit', 'google', 'hobby', 'internet', 'jogging'],
    "user2": ['cat', 'dog', 'elephant', 'fruit', 'google', 'jogging', 'kotlin'],
    "user3": ['java', 'rx', 'yahoo', 'zoo'],
    "user4": ['apple', 'banana'],
    "user5": ['airplane'],
    "user6": ['bobby', 'dog'],
    "user7": ['train', 'cat', 'exercise', 'healthy'],
    "user8": ['healthy', 'dog', 'exercise', 'banana', 'youtube'],
    "user9": ['java', 'javascript', 'rx', 'zoo', 'yahoo', 'google', 'github'],
    "user10": ['cook', 'bobby', 'dog', 'youtube'],
    "user11": ['dance', 'airplane', 'trip', 'elephant', 'fruit', 'google']
}

# ์ธ์Šคํ„ด์Šค ์ดˆ๊ธฐํ™”
imc = InMemoryCluster(
    minhash_host='localhost:6379',
    secondary_index_host='localhost:6379',
    minhash_db=1,
    secondary_index_db=2,
    seed=1
)

# MinHash DB์™€ Secondary Index DB๋ฅผ ๋‚ ๋ฆฝ๋‹ˆ๋‹ค.
# ์˜ˆ์ œ์—์„œ๋Š” ๊ฐ๊ฐ 1๋ฒˆ DB์™€ 2๋ฒˆ DB๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.
imc.flush_all()

# init_cluster ๋ฉ”์†Œ๋“œ๋Š” ๋‹ค์Œ๊ณผ ๊ฐ™์ด ๋™์ž‘ํ•ฉ๋‹ˆ๋‹ค.
# 1. ๋ฐ์ดํ„ฐ๋ฅผ ๋„˜๊ฒจ ๋ฐ›์•„ Minhash ๋ฅผ ์ƒ์„ฑํ•˜๊ณ  ์ €์žฅํ•ฉ๋‹ˆ๋‹ค. (db 1).
# 2. MinHash๋ฅผ ๋ถˆ๋Ÿฌ์™€ Secondary Index๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
# 3. ๊ฐ๊ฐ์˜ Secondary Index์— ํ‚ค๋ฅผ ๋„ฃ์–ด์ค๋‹ˆ๋‹ค.(db 2)
imc.init_cluster(data)

users = data.keys()
for user in users:
    # ๋น„์Šทํ•œ ์œ ์ € 10๋ช…์„ ๋ฝ‘์Šต๋‹ˆ๋‹ค.
    print(user, imc.most_common(user, count=10))

# ์ค‘๋ณต์— ๋Œ€ํ•œ ๊ฑฑ์ •์€ ํ•˜์‹ค ํ•„์š”๊ฐ€ ์—†์–ด์š”.
update_data = {'user1': ['airplane', 'banana', 'cat'], 'user5': ['hobby', 'internet', 'jogging', 'banana', 'cat', 'dog']}

# init_cluster ์ดํ›„์—๋Š” update_cluster๋ฅผ ์‚ฌ์šฉํ•ด ์ฃผ์…”์•ผ ํ•ฉ๋‹ˆ๋‹ค.
# init_cluster ๋Š” update_cluster ๋ณด๋‹ค ๋น ๋ฆ…๋‹ˆ๋‹ค.
# ๊ทธ๋Ÿฌ๋‹ˆ ์ฒ˜์Œ ๋ฐ์ดํ„ฐ๋ฅผ ์ง‘์–ด๋„ฃ์œผ์‹ค ๋•Œ๋Š” init_cluster๋ฅผ ์‚ฌ์šฉํ•ด์ฃผ์„ธ์š”.
imc.update_cluster(update_data)

print('======== UPDATED!! =========')

# user5๋Š” ์ „๋ณด๋‹ค ๋” ๋น„์Šทํ•ด ์กŒ๋‹ค๋Š” ๊ฑธ ํ™•์ธํ•˜์‹ค ์ˆ˜ ์žˆ์–ด์š”.
for user in users:
    print(user, imc.most_common(user, count=10))

그리고 결과는 다음과 같습니다.

user1 [(b'user2', 60), (b'user11', 45), (b'user5', 16), (b'user8', 13), (b'user7', 12), (b'user4', 11), (b'user6', 8), (b'user10', 5), (b'user9', 5)]
user2 [(b'user1', 60), (b'user11', 36), (b'user6', 14), (b'user10', 12), (b'user7', 12), (b'user9', 9), (b'user8', 9)]
user3 [(b'user9', 75)]
user4 [(b'user8', 23), (b'user1', 11)]
user5 [(b'user11', 25), (b'user1', 16)]
user6 [(b'user10', 67), (b'user8', 17), (b'user2', 14), (b'user1', 8)]
user7 [(b'user8', 42), (b'user1', 12), (b'user2', 12)]
user8 [(b'user7', 42), (b'user10', 37), (b'user4', 23), (b'user6', 17), (b'user1', 13), (b'user2', 9)]
user9 [(b'user3', 75), (b'user2', 9), (b'user11', 7), (b'user1', 5)]
user10 [(b'user6', 67), (b'user8', 37), (b'user2', 12), (b'user1', 5)]
user11 [(b'user1', 45), (b'user2', 36), (b'user5', 25), (b'user9', 7)]
======== UPDATED!! =========
user1 [(b'user5', 93), (b'user2', 60), (b'user11', 45), (b'user8', 13), (b'user7', 12), (b'user4', 11), (b'user6', 8), (b'user10', 5), (b'user9', 5)]
user2 [(b'user1', 60), (b'user11', 36), (b'user5', 28), (b'user6', 14), (b'user10', 12), (b'user7', 12), (b'user9', 9), (b'user8', 9)]
user3 [(b'user9', 75)]
user4 [(b'user8', 23), (b'user5', 15), (b'user1', 11)]
user5 [(b'user1', 93), (b'user2', 28), (b'user8', 20), (b'user6', 15), (b'user7', 15), (b'user4', 15), (b'user11', 13), (b'user10', 10)]
user6 [(b'user10', 67), (b'user8', 17), (b'user5', 15), (b'user2', 14), (b'user1', 8)]
user7 [(b'user8', 42), (b'user5', 15), (b'user1', 12), (b'user2', 12)]
user8 [(b'user7', 42), (b'user10', 37), (b'user4', 23), (b'user5', 20), (b'user6', 17), (b'user1', 13), (b'user2', 9)]
user9 [(b'user3', 75), (b'user2', 9), (b'user11', 7), (b'user1', 5)]
user10 [(b'user6', 67), (b'user8', 37), (b'user2', 12), (b'user5', 10), (b'user1', 5)]
user11 [(b'user1', 45), (b'user2', 36), (b'user5', 13), (b'user9', 7)]

TODO

  • 성능향상
  • 다른언어로 구현하기

Special Thanks

@GulliverNam