Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e73efcc
手搓版本
zxhhai Oct 13, 2024
a71dc79
Update README.md
zxhhai Oct 13, 2024
8de8df4
Update README.md
zxhhai Oct 13, 2024
df4f4df
Update README.md
zxhhai Oct 13, 2024
1a373e5
Update README.md
zxhhai Oct 13, 2024
facc84a
Update README.md
zxhhai Oct 13, 2024
e5dd31b
Update README.md
zxhhai Oct 13, 2024
41a6e4d
Merge branch 'HandMadeRAG' of https://github.com/LLLLLLLLazy/PythonAs…
zxhhai Oct 13, 2024
47a0585
Update README.md
zxhhai Oct 13, 2024
3af7bfa
Update README.md
zxhhai Oct 13, 2024
d9e6443
Update README.md
zxhhai Oct 13, 2024
ef3761a
Update README.md
zxhhai Oct 13, 2024
70f2bc9
修复BUG
zxhhai Oct 13, 2024
38879dc
更新知识文件
zxhhai Oct 13, 2024
8ee5564
Create download_embedding_model.py
zxhhai Oct 13, 2024
0d2fc5f
Localize embbedding_model
zxhhai Oct 13, 2024
02eba8b
update
zxhhai Oct 13, 2024
5cefd3c
Update README.md
zxhhai Oct 13, 2024
928ee0c
Update README.md
zxhhai Oct 13, 2024
07cad54
Update README.md
zxhhai Oct 13, 2024
b525561
update
zxhhai Oct 13, 2024
3b22966
commit
zxhhai Oct 14, 2024
97671df
commit
zxhhai Oct 14, 2024
2622488
commit
zxhhai Oct 14, 2024
e6dd291
commit
zxhhai Oct 14, 2024
26dad4a
add BASELINE
zxhhai Oct 15, 2024
3c4c49d
update prompt template
zxhhai Oct 15, 2024
eb62fb7
update prompt template
zxhhai Oct 15, 2024
4c1e332
Add Runner class
zxhhai Oct 18, 2024
ddd00ee
Add Runner class
zxhhai Oct 18, 2024
6e7e4f8
模型改为API调用,优化了类关系
zxhhai Dec 21, 2024
ce8055c
llm update to deepseek-r1
zxhhai Feb 25, 2025
8e1944d
llm update to deepseek-r1
zxhhai Feb 25, 2025
c640024
update
zxhhai Mar 12, 2025
fa720d3
Delete tempCodeRunnerFile.python
zxhhai Mar 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .gitignore

This file was deleted.

151 changes: 151 additions & 0 deletions Database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import os
import json
from openai import OpenAI
import numpy as np


from config import FILES_PATH, DATABASE_PATH, K, BASELINE, DATABASE_NAME
from chunker import chunking
from embedding import embedding




class Database():
    """Read-only vector knowledge base backed by a JSON file.

    Expected JSON layout:
        {"files": [file_name, ...],
         "vectors": {file_name: {chunk_text: [float, ...], ...}, ...}}
    """

    def __init__(self, name: str) -> None:
        """Open the database at ``<DATABASE_PATH>/<name>.json``.

        Raises:
            FileNotFoundError: if the JSON file does not exist.
        """
        self.db = os.path.join(DATABASE_PATH, (name + '.json'))

        if not os.path.exists(self.db):
            # Carry the message in the exception instead of printing it
            # separately before a bare raise.
            raise FileNotFoundError(f"{self.db} not found.")

        with open(self.db, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.files = data["files"]

        # SECURITY: the API key was previously hard-coded here (a leaked
        # secret). Supply it via the DASHSCOPE_API_KEY environment variable.
        self.embedding_client = OpenAI(
            api_key=os.environ.get("DASHSCOPE_API_KEY"),
            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"  # Bailian (DashScope) OpenAI-compatible endpoint
        )

    def cosine_similarity(self, vector1: list[float], vector2: list[float]) -> float:
        """Return the cosine similarity of two equal-length vectors."""
        return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

    def top_k_chunks(self, query: str, k: int = K, baseline: float = BASELINE) -> list[str]:
        """Return up to ``k`` chunks most similar to ``query``.

        Chunks scoring below ``baseline`` are dropped, so fewer than ``k``
        chunks may be returned.
        """
        # Embed the query once.
        query_vector = np.array(embedding(self.embedding_client, query))

        # Re-read the store each call so additions/deletions made after
        # construction are visible.
        with open(self.db, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Score every chunk across all files.
        similarities = {}
        for _file, chunk_vectors in data["vectors"].items():
            for chunk, vector in chunk_vectors.items():
                similarities[chunk] = self.cosine_similarity(query_vector, np.array(vector))

        # Take the k best, then filter by the baseline threshold.
        # BUG FIX: the original did ``top_k.remove(chunk)`` while iterating
        # ``top_k``, which skips the element after every removal; build a
        # filtered list instead.
        top_k = sorted(similarities, key=similarities.get, reverse=True)[:k]
        return [chunk for chunk in top_k if similarities[chunk] >= baseline]


class DynamicDatabase(Database):
    """A ``Database`` that is created on first use and supports mutation.

    Adds ``add``/``delete`` to keep the JSON store in sync with the
    knowledge files on disk.
    """

    def __init__(self, name: str) -> None:
        """Open — or create, if missing — ``<DATABASE_PATH>/<name>.json``.

        BUG FIX: the original called ``super().__init__(name)`` first, which
        raises ``FileNotFoundError`` when the file is missing, making the
        create-if-missing code after it unreachable. The empty database must
        be written *before* delegating to the parent constructor.
        """
        path = os.path.join(DATABASE_PATH, (name + '.json'))
        if not os.path.exists(path):
            with open(path, 'w', encoding='utf-8') as f:
                json.dump({"files": [], "vectors": {}}, f, ensure_ascii=False)
        super().__init__(name)

    def add(self, file_name: str) -> None:
        """Chunk, embed and store ``<FILES_PATH>/<file_name>`` in the database."""
        file_path = os.path.join(FILES_PATH, file_name)

        # Read the whole file as one string.
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Split into chunks and embed each non-empty chunk.
        chunks_vector = {}
        for chunk in chunking(text):
            if chunk:
                chunks_vector[chunk] = embedding(self.embedding_client, chunk)

        # Merge the new vectors into the on-disk store.
        with open(self.db, 'r', encoding='utf-8') as f:
            data = json.load(f)

        data["vectors"][file_name] = chunks_vector
        self.files.append(file_name)
        data["files"] = self.files

        with open(self.db, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

    def delete(self, file_name: str) -> None:
        """Remove ``file_name`` and its vectors from the database.

        Raises:
            KeyError: if the file has no vectors recorded.
            ValueError: if the file is not listed in ``self.files``.
        """
        with open(self.db, 'r', encoding='utf-8') as f:
            data = json.load(f)

        data["vectors"].pop(file_name)
        self.files.remove(file_name)
        data["files"] = self.files

        with open(self.db, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

def init_database():
    """Synchronise the JSON knowledge base with the files on disk.

    Files present on disk but absent from the database are added; files
    recorded in the database but missing from disk are deleted; files in
    both places are left untouched.
    """
    database = DynamicDatabase(DATABASE_NAME)

    on_disk = os.listdir(FILES_PATH)

    # Union of disk files and already-recorded files, disk order first.
    candidates = list(on_disk)
    candidates.extend(name for name in database.files if name not in candidates)

    for file in candidates:
        recorded = file in database.files
        if recorded and file in on_disk:
            print(f"{file} already in {database.db}.")
        elif recorded:
            # Recorded but gone from disk: purge the stale entry.
            database.delete(file)
            print(f"{file} deleted from {database.db}.")
        else:
            # On disk but not yet recorded: ingest it.
            database.add(file)
            print(f"{file} added to {database.db}.")

    print(f"{database.db} initialized.")

# Script entry point: rebuild/refresh the knowledge base when run directly.
if __name__ == "__main__":
    init_database()
9 changes: 9 additions & 0 deletions Embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from config import EMBEDDING_NAME

def embedding(client, chunk: str) -> list[float]:
    """Return the embedding vector for *chunk* using the configured model.

    NOTE(review): this file appears to be ``Embedding.py`` while callers do
    ``from embedding import embedding`` — confirm the filename case matches
    on case-sensitive filesystems.
    """
    response = client.embeddings.create(
        model=EMBEDDING_NAME,
        input=chunk,
        encoding_format="float",
    )
    return response.data[0].embedding
160 changes: 0 additions & 160 deletions RAG_ai/Copy.py

This file was deleted.

Loading