Skip to content

Commit 459a154

Browse files
author
shijiashuai
committed
对你的书签分类项目进行了以下优化:
1. 扩展规则库
   - agent/config/domain_rules.json: 从 25 条扩展到 176 条域名规则,覆盖 AI、编程、云服务、数据库、社区等多个领域
   - agent/config/keyword_rules.json: 从 35 条扩展到 173 条关键词规则,支持权重、排除规则和上下文依赖
2. 新增智能规则加载器 (src/smart_rule_loader.py)
   - 自动加载和解析 agent/config 目录下的规则文件
   - 支持分组格式的规则定义
   - 支持分类名称映射,统一不同来源的分类名称
   - 自动合并到主配置
3. 新增 URL 智能分析器 (src/url_analyzer.py)
   - 识别 GitHub/GitLab 仓库、文档站点、博客、视频平台等
   - 提取仓库信息(owner/repo)
   - 检测编程语言
   - 生成分类提示
4. 优化规则引擎 (src/rule_engine.py)
   - 集成 URL 分析器
   - 智能合并相似分类的得分
   - 当 AI 和代码仓库同时匹配时,优先选择 AI 分类
5. 优化分类器 (src/ai_classifier.py)
   - 集成智能规则加载器
   - 调整方法权重(规则引擎 50%…
1 parent 566578e commit 459a154

8 files changed

Lines changed: 2382 additions & 181 deletions

File tree

agent/config/domain_rules.json

Lines changed: 734 additions & 29 deletions
Large diffs are not rendered by default.

agent/config/keyword_rules.json

Lines changed: 912 additions & 46 deletions
Large diffs are not rendered by default.

config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"show_confidence_indicator": false,
33
"ai_settings": {
4-
"confidence_threshold": 0.7,
4+
"confidence_threshold": 0.4,
55
"use_semantic_analysis": true,
66
"use_user_profiling": true,
77
"cache_size": 10000,
@@ -668,4 +668,4 @@
668668
]
669669
}
670670
}
671-
}
671+
}

gemini.md

Lines changed: 0 additions & 96 deletions
This file was deleted.

src/ai_classifier.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@
3434

3535
from .rule_engine import RuleEngine
3636

37+
# 导入智能规则加载器
38+
try:
39+
from .smart_rule_loader import SmartRuleLoader, merge_with_main_config
40+
except ImportError:
41+
SmartRuleLoader = None
42+
merge_with_main_config = None
43+
3744
# 导入占位符模块
3845
from .placeholder_modules import (
3946
SemanticAnalyzer, UserProfiler, PerformanceMonitor
@@ -170,6 +177,17 @@ def _load_config(self) -> Dict:
170177
try:
171178
with open(self.config_path, 'r', encoding='utf-8') as f:
172179
config = json.load(f)
180+
181+
# 加载智能规则并合并
182+
if SmartRuleLoader is not None and merge_with_main_config is not None:
183+
try:
184+
loader = SmartRuleLoader()
185+
smart_rules = loader.load_all()
186+
config = merge_with_main_config(config, smart_rules)
187+
self.logger.info(f"已加载智能规则: {smart_rules.get('_meta', {})}")
188+
except Exception as e:
189+
self.logger.warning(f"智能规则加载失败,使用默认配置: {e}")
190+
173191
return self._normalize_category_config(config)
174192
except Exception as e:
175193
self.logger.error(f"配置文件加载失败: {e}")
@@ -437,9 +455,9 @@ def _ensemble_classification(self, results: List[ClassificationResult], features
437455
merged_facets: Dict[str, str] = {}
438456

439457
method_weights = {
440-
'rule_engine': 0.35,
441-
'machine_learning': 0.25,
442-
'semantic_analyzer': 0.15,
458+
'rule_engine': 0.50, # 提高规则引擎权重
459+
'machine_learning': 0.15, # 降低 ML 权重(因为模型可能过时)
460+
'semantic_analyzer': 0.10,
443461
'user_profiler': 0.10,
444462
'llm': 0.50,
445463
}

src/rule_engine.py

Lines changed: 78 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111
from collections import defaultdict
1212
from urllib.parse import urlparse
1313

14+
# 导入 URL 分析器
15+
try:
16+
from .url_analyzer import URLAnalyzer, URLAnalysis
17+
except ImportError:
18+
URLAnalyzer = None
19+
URLAnalysis = None
20+
1421
@dataclass
1522
class RuleMatch:
1623
"""规则匹配结果"""
@@ -27,6 +34,9 @@ def __init__(self, config: Dict):
2734
self.config = config
2835
self.logger = logging.getLogger(__name__)
2936

37+
# URL 分析器
38+
self.url_analyzer = URLAnalyzer() if URLAnalyzer else None
39+
3040
# 预编译规则
3141
self.compiled_rules = {}
3242
self._compile_rules()
@@ -35,7 +45,8 @@ def __init__(self, config: Dict):
3545
self.stats = {
3646
'total_matches': 0,
3747
'rule_hits': defaultdict(int),
38-
'category_predictions': defaultdict(int)
48+
'category_predictions': defaultdict(int),
49+
'url_analysis_hits': 0
3950
}
4051

4152
def _compile_rules(self):
@@ -133,11 +144,32 @@ def classify(self, features) -> Optional[Dict]:
133144
try:
134145
matches = self._find_matches(features)
135146

136-
if not matches:
147+
# 使用 URL 分析器增强分类
148+
url_hints = []
149+
if self.url_analyzer and hasattr(features, 'url'):
150+
try:
151+
analysis = self.url_analyzer.analyze(features.url)
152+
if analysis.category_hints:
153+
self.stats['url_analysis_hits'] += 1
154+
for category, confidence in analysis.category_hints:
155+
url_hints.append(RuleMatch(
156+
rule_id='url_analyzer',
157+
category=category,
158+
confidence=confidence * 15, # 增加权重
159+
matched_text=f"{analysis.site_type}:{analysis.content_type}",
160+
rule_type='url_analysis'
161+
))
162+
except Exception as e:
163+
self.logger.debug(f"URL 分析失败: {e}")
164+
165+
# 合并匹配结果
166+
all_matches = matches + url_hints
167+
168+
if not all_matches:
137169
return None
138170

139171
# 计算分类得分
140-
category_scores = self._calculate_scores(matches)
172+
category_scores = self._calculate_scores(all_matches)
141173

142174
if not category_scores:
143175
return None
@@ -152,7 +184,7 @@ def classify(self, features) -> Optional[Dict]:
152184
confidence = confidence / total_score
153185

154186
# 生成推理过程
155-
reasoning = self._generate_reasoning(matches, best_category)
187+
reasoning = self._generate_reasoning(all_matches, best_category)
156188

157189
# 生成备选分类
158190
alternatives = []
@@ -281,8 +313,49 @@ def _calculate_scores(self, matches: List[RuleMatch]) -> Dict[str, float]:
281313
"""计算分类得分"""
282314
category_scores = defaultdict(float)
283315

316+
# 检查是否有特定类型的匹配
317+
has_ai_match = any('AI' in m.category for m in matches)
318+
has_code_repo_match = any('代码仓库' in m.category for m in matches)
319+
284320
for match in matches:
285-
category_scores[match.category] += match.confidence
321+
score = match.confidence
322+
323+
# 如果同时有 AI 和代码仓库匹配,根据内容调整权重
324+
if has_ai_match and has_code_repo_match:
325+
if 'AI' in match.category:
326+
score *= 1.3 # 适度提升 AI 分类权重
327+
elif '代码仓库' in match.category:
328+
score *= 0.8 # 适度降低代码仓库权重
329+
330+
category_scores[match.category] += score
331+
332+
# 合并相似分类的得分(同一顶级分类下的子分类)
333+
merged_scores = defaultdict(float)
334+
for category, score in category_scores.items():
335+
# 提取顶级分类
336+
top_category = category.split('/')[0]
337+
merged_scores[top_category] += score
338+
339+
# 如果某个顶级分类的合并得分明显高于其他分类,选择该分类下得分最高的子分类
340+
if merged_scores:
341+
top_merged = max(merged_scores, key=merged_scores.get)
342+
top_merged_score = merged_scores[top_merged]
343+
344+
# 如果顶级分类得分占比超过 40%,选择该分类下的最佳子分类
345+
total_merged = sum(merged_scores.values())
346+
if total_merged > 0 and top_merged_score / total_merged > 0.4:
347+
# 找到该顶级分类下得分最高的子分类
348+
best_sub = None
349+
best_sub_score = 0
350+
for category, score in category_scores.items():
351+
if category.startswith(top_merged):
352+
if score > best_sub_score:
353+
best_sub = category
354+
best_sub_score = score
355+
356+
if best_sub:
357+
# 将合并得分赋给最佳子分类
358+
category_scores[best_sub] = top_merged_score
286359

287360
return dict(category_scores)
288361

0 commit comments

Comments
 (0)