Skip to content

Commit 3c31b0b

Browse files
author
shijiashuai
committed
refactor: code quality improvements - cleanup service, type safety, store optimization, URL/search/folder utils
1 parent 259b6ba commit 3c31b0b

11 files changed

Lines changed: 154 additions & 124 deletions

main.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,7 @@
1212
import glob
1313
from pathlib import Path
1414

15-
# 添加项目路径
16-
PROJECT_ROOT = Path(__file__).parent
17-
sys.path.insert(0, str(PROJECT_ROOT))
18-
15+
from src import __version__
1916
from src.ai_classifier import AIBookmarkClassifier
2017
from src.bookmark_processor import BookmarkProcessor
2118
from src.cli_interface import CLIInterface
@@ -48,7 +45,7 @@ def main():
4845
)
4946

5047
# 基本参数
51-
parser.add_argument('-V', '--version', action='version', version='%(prog)s 2.0.0')
48+
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {__version__}')
5249

5350
parser.add_argument('-i', '--input', nargs='+', help='输入的HTML书签文件')
5451
parser.add_argument('-o', '--output', default='output', help='输出目录')

models/recommendation.pkl

160 Bytes
Binary file not shown.

src/__init__.py

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,29 @@
22
33
为现有模块提供包命名空间,便于打包与 console script 导入。
44
"""
5-
__all__ = []
65

76
__version__ = "2.0.0"
7+
8+
__all__ = [
9+
"AIBookmarkClassifier",
10+
"BookmarkProcessor",
11+
"RuleEngine",
12+
"DataExporter",
13+
"BookmarkDeduplicator",
14+
]
15+
16+
17+
# Public names that are resolved lazily, mapped to the relative submodule
# that defines each of them. Built once at import time instead of on every
# attribute lookup.
_LAZY_EXPORTS = {
    "AIBookmarkClassifier": ".ai_classifier",
    "BookmarkProcessor": ".bookmark_processor",
    "RuleEngine": ".rule_engine",
    "DataExporter": ".data_exporter",
    "BookmarkDeduplicator": ".deduplicator",
}


def __getattr__(name: str):
    """Resolve a lazily exported name on first access (PEP 562 hook).

    Importing on demand avoids loading every submodule when the package
    itself is imported.

    Args:
        name: Attribute requested on this module that was not found through
            normal lookup.

    Returns:
        The object called ``name`` taken from the submodule registered for
        it in ``_LAZY_EXPORTS``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    try:
        submodule = _LAZY_EXPORTS[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None

    import importlib

    value = getattr(importlib.import_module(submodule, __name__), name)
    # Store the resolved object in the module namespace so later lookups
    # succeed through normal attribute access and skip this hook entirely.
    globals()[name] = value
    return value

src/ai_classifier.py

Lines changed: 60 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
from typing import Dict, List, Tuple, Optional
1616
from dataclasses import dataclass, field
1717
from datetime import datetime
18-
from collections import defaultdict
18+
from collections import defaultdict, OrderedDict
1919
import hashlib
2020
import re
2121
from urllib.parse import urlparse
@@ -42,10 +42,14 @@
4242
SmartRuleLoader = None
4343
merge_with_main_config = None
4444

45-
# 导入占位符模块
46-
from .placeholder_modules import (
47-
SemanticAnalyzer, UserProfiler, PerformanceMonitor
48-
)
45+
# 导入核心分析组件
46+
from .semantic_analyzer import SemanticAnalyzer
47+
from .user_profiler import UserProfiler
48+
49+
try:
50+
from .performance_optimizer import PerformanceMonitor
51+
except ImportError:
52+
PerformanceMonitor = None # type: ignore[misc,assignment]
4953

5054

5155
@dataclass
@@ -107,9 +111,9 @@ def __init__(self, config_path: str = "config.json", enable_ml: bool = True, con
107111
self._ml_classifier: Optional[MLClassifierWrapper] = None
108112
self._llm_classifier: Optional[LLMClassifier] = None
109113

110-
# 缓存
114+
# 缓存(OrderedDict 实现 LRU 淘汰)
111115
self.feature_cache: Dict[str, BookmarkFeatures] = {}
112-
self.classification_cache: Dict[str, ClassificationResult] = {}
116+
self.classification_cache: OrderedDict[str, ClassificationResult] = OrderedDict()
113117
self._max_cache_size = 5000
114118

115119
# 统计
@@ -150,9 +154,12 @@ def user_profiler(self) -> UserProfiler:
150154
return self._user_profiler
151155

152156
@property
153-
def performance_monitor(self) -> PerformanceMonitor:
154-
if self._performance_monitor is None:
155-
self._performance_monitor = PerformanceMonitor()
157+
def performance_monitor(self) -> Optional['PerformanceMonitor']:
158+
if self._performance_monitor is None and PerformanceMonitor is not None:
159+
try:
160+
self._performance_monitor = PerformanceMonitor()
161+
except Exception as e:
162+
self.logger.warning(f"性能监控器初始化失败: {e}")
156163
return self._performance_monitor
157164

158165
@property
@@ -300,6 +307,7 @@ def classify(self, url: str, title: str) -> ClassificationResult:
300307
cache_key = hashlib.md5(f"{url}::{title}".encode()).hexdigest()
301308
if cache_key in self.classification_cache:
302309
self.stats['cache_hits'] += 1
310+
self.classification_cache.move_to_end(cache_key) # LRU 更新
303311
cached = self.classification_cache[cache_key]
304312
cached.processing_time = (datetime.now() - start_time).total_seconds()
305313
return cached
@@ -310,43 +318,38 @@ def classify(self, url: str, title: str) -> ClassificationResult:
310318
# 多方法融合
311319
results: List[ClassificationResult] = []
312320

321+
def _collect(raw):
322+
"""将 dict / ClassificationResult / None 统一追加到 results。"""
323+
if raw is None:
324+
return
325+
results.append(self._to_classification_result(raw))
326+
313327
# 1) 规则引擎
314-
rule_result = self.rule_engine.classify(features)
315-
if rule_result:
316-
results.append(rule_result)
328+
_collect(self.rule_engine.classify(features))
317329

318330
# 2) 机器学习
319331
if self.ml_classifier:
320-
ml_result = self.ml_classifier.classify(features)
321-
if ml_result:
322-
results.append(ml_result)
332+
_collect(self.ml_classifier.classify(features))
323333

324334
# 3) 语义分析
325335
if self.config.get('ai_settings', {}).get('use_semantic_analysis', True):
326-
semantic_result = self.semantic_analyzer.classify(features)
327-
if semantic_result:
328-
results.append(semantic_result)
336+
_collect(self.semantic_analyzer.classify(features))
329337

330338
# 4) 用户画像
331339
if self.config.get('ai_settings', {}).get('use_user_profiling', True):
332-
user_result = self.user_profiler.classify(features)
333-
if user_result:
334-
results.append(user_result)
340+
_collect(self.user_profiler.classify(features))
335341

336342
# 5) LLM(可选)
337343
if self.llm_classifier and self.llm_classifier.enabled():
338344
try:
339-
llm_result = self.llm_classifier.classify(
340-
url,
341-
title,
345+
_collect(self.llm_classifier.classify(
346+
url, title,
342347
context={
343348
'domain': features.domain,
344349
'content_type': features.content_type,
345350
'language': features.language,
346351
},
347-
)
348-
if llm_result:
349-
results.append(llm_result)
352+
))
350353
except Exception as e:
351354
self.logger.warning(f"LLM 分类调用失败: {e}")
352355

@@ -376,11 +379,31 @@ def classify(self, url: str, title: str) -> ClassificationResult:
376379
self._cache_result(cache_key, final_result)
377380
return final_result
378381

382+
@staticmethod
def _to_classification_result(raw) -> 'ClassificationResult':
    """Normalize a classifier output into a ``ClassificationResult``.

    Args:
        raw: Either an existing ``ClassificationResult`` (returned as-is)
            or a ``dict`` carrying the result fields.

    Returns:
        A ``ClassificationResult``. For dict input, fields that are missing
        or falsy (``None``, empty) fall back to the same defaults that a
        missing key receives.

    Raises:
        TypeError: If ``raw`` is neither a ``ClassificationResult`` nor a
            ``dict``.
    """
    if isinstance(raw, ClassificationResult):
        return raw
    if not isinstance(raw, dict):
        raise TypeError(f"Unexpected classification result type: {type(raw)}")

    # dict.get() only applies its default when the key is absent. A key that
    # is present with an explicit None (e.g. JSON null) would otherwise crash
    # float() or leak None into list/dict fields, so fall back with `or`.
    return ClassificationResult(
        category=raw.get('category') or '未分类',
        confidence=float(raw.get('confidence') or 0.0),
        subcategory=raw.get('subcategory'),
        reasoning=raw.get('reasoning') or [],
        alternatives=raw.get('alternatives') or [],
        processing_time=float(raw.get('processing_time') or 0.0),
        method=raw.get('method') or 'unknown',
        facets=raw.get('facets') or {},
    )
399+
379400
def _cache_result(self, cache_key: str, result: ClassificationResult):
380-
if len(self.classification_cache) >= self._max_cache_size:
381-
oldest_key = next(iter(self.classification_cache))
382-
del self.classification_cache[oldest_key]
383-
self.classification_cache[cache_key] = result
401+
if cache_key in self.classification_cache:
402+
self.classification_cache.move_to_end(cache_key)
403+
else:
404+
if len(self.classification_cache) >= self._max_cache_size:
405+
self.classification_cache.popitem(last=False) # 淘汰最久未使用
406+
self.classification_cache[cache_key] = result
384407

385408
def _ensemble_classification(self, results: List[ClassificationResult], features: BookmarkFeatures) -> ClassificationResult:
386409
if not results:
@@ -406,18 +429,11 @@ def _ensemble_classification(self, results: List[ClassificationResult], features
406429
}
407430

408431
for res in results:
409-
if isinstance(res, dict):
410-
method = res.get('method', 'unknown')
411-
category = self._normalize_category_string(res.get('category', '未分类')) or '未分类'
412-
confidence = res.get('confidence', 0.0)
413-
reasoning = res.get('reasoning', [])
414-
facets = res.get('facets', {}) or {}
415-
else:
416-
method = res.method
417-
category = self._normalize_category_string(res.category) or '未分类'
418-
confidence = res.confidence
419-
reasoning = res.reasoning
420-
facets = getattr(res, 'facets', {}) or {}
432+
method = res.method
433+
category = self._normalize_category_string(res.category) or '未分类'
434+
confidence = res.confidence
435+
reasoning = res.reasoning
436+
facets = res.facets or {}
421437

422438
weight = method_weights.get(method, 0.1)
423439
category_scores[category] += confidence * weight

src/bookmark_processor.py

Lines changed: 7 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -30,10 +30,10 @@
3030
except ImportError:
3131
LLMBookmarkOrganizer = None
3232

33-
# 导入占位符模块
34-
from .placeholder_modules import (
35-
DataExporter, BookmarkDeduplicator, HealthChecker
36-
)
33+
# 导入核心组件
34+
from .data_exporter import DataExporter
35+
from .deduplicator import BookmarkDeduplicator
36+
from .bookmark_health_checker import HealthChecker
3737

3838
class BookmarkProcessor:
3939
"""书签处理器主类"""
@@ -140,23 +140,20 @@ def classifier(self):
140140
def deduplicator(self):
141141
"""Lazy loading deduplicator"""
142142
if self._deduplicator is None:
143-
from .placeholder_modules import BookmarkDeduplicator
144143
self._deduplicator = BookmarkDeduplicator()
145144
return self._deduplicator
146145

147146
@property
148147
def health_checker(self):
149148
"""Lazy loading health checker"""
150149
if self._health_checker is None:
151-
from .placeholder_modules import HealthChecker
152150
self._health_checker = HealthChecker()
153151
return self._health_checker
154152

155153
@property
156154
def exporter(self):
157155
"""Lazy loading exporter"""
158156
if self._exporter is None:
159-
from .placeholder_modules import DataExporter
160157
self._exporter = DataExporter(config=self.config)
161158
return self._exporter
162159

@@ -241,6 +238,9 @@ def process_files(self, input_files: List[str], output_dir: str = "output",
241238

242239
# 可选:调用 LLM 进行更高层次的整理
243240
self.llm_organizer_meta = None
241+
self.stats['llm_organizer_used'] = False
242+
self.stats.pop('llm_organizer_meta', None)
243+
244244
if self.llm_organizer and self.llm_organizer.enabled():
245245
try:
246246
llm_result = self.llm_organizer.organize(
@@ -257,16 +257,6 @@ def process_files(self, input_files: List[str], output_dir: str = "output",
257257
self.stats['llm_organizer_used'] = True
258258
if self.llm_organizer_meta:
259259
self.stats['llm_organizer_meta'] = self.llm_organizer_meta
260-
elif 'llm_organizer_meta' in self.stats:
261-
self.stats.pop('llm_organizer_meta', None)
262-
else:
263-
self.stats['llm_organizer_used'] = False
264-
if 'llm_organizer_meta' in self.stats:
265-
self.stats.pop('llm_organizer_meta', None)
266-
else:
267-
self.stats['llm_organizer_used'] = False
268-
if 'llm_organizer_meta' in self.stats:
269-
self.stats.pop('llm_organizer_meta', None)
270260

271261
organized_bookmarks = self._sort_organized_structure(organized_bookmarks)
272262

src/llm_classifier.py

Lines changed: 5 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,9 @@
3131

3232
import requests
3333

34+
from .category_utils import strip_category_prefix, normalize_category_string
35+
36+
3437
class LLMClassifier:
3538
def __init__(self, config_path: str = "config.json"):
3639
self.config_path = config_path
@@ -152,26 +155,10 @@ def _load_config(self) -> Dict:
152155

153156
@staticmethod
154157
def _strip_category_prefix(text: str) -> str:
155-
if not text:
156-
return ""
157-
s = str(text).strip()
158-
i = 0
159-
while i < len(s) and not ("\u4e00" <= s[i] <= "\u9fff" or s[i].isalnum()):
160-
i += 1
161-
return s[i:].strip() if i < len(s) else s
158+
return strip_category_prefix(text)
162159

163160
def _normalize_category_string(self, category: str) -> str:
164-
if not category:
165-
return ""
166-
cat = str(category).strip()
167-
if not cat:
168-
return ""
169-
if '/' in cat:
170-
main, sub = cat.split('/', 1)
171-
main_n = self._strip_category_prefix(main)
172-
sub_n = self._strip_category_prefix(sub)
173-
return f"{main_n}/{sub_n}" if sub_n else main_n
174-
return self._strip_category_prefix(cat)
161+
return normalize_category_string(category)
175162

176163
def _collect_valid_categories(self, config: Dict) -> List[str]:
177164
cats = []

src/placeholder_modules.py

Lines changed: 23 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -1,38 +1,32 @@
1-
"""
2-
Placeholder Modules - Backward Compatibility Forwarding Layer
1+
"""Backward Compatibility Forwarding Layer (DEPRECATED)
2+
3+
此模块已弃用。请直接从对应模块导入:
34
4-
Original implementations have been split into independent modules:
5-
- semantic_analyzer.py -> SemanticAnalyzer
6-
- user_profiler.py -> UserProfiler
7-
- deduplicator.py -> BookmarkDeduplicator
8-
- bookmark_health_checker.py -> HealthChecker, HealthStatus
9-
- data_exporter.py -> DataExporter
10-
- performance_optimizer.py -> PerformanceMonitor
5+
from src.semantic_analyzer import SemanticAnalyzer
6+
from src.user_profiler import UserProfiler
7+
from src.deduplicator import BookmarkDeduplicator
8+
from src.bookmark_health_checker import HealthChecker, HealthStatus
9+
from src.data_exporter import DataExporter
10+
from src.performance_optimizer import PerformanceMonitor
1111
"""
12+
import warnings as _warnings
1213

13-
from .semantic_analyzer import SemanticAnalyzer
14-
from .user_profiler import UserProfiler
15-
from .deduplicator import BookmarkDeduplicator
16-
from .bookmark_health_checker import HealthChecker, HealthStatus
17-
from .data_exporter import DataExporter
14+
_warnings.warn(
15+
"placeholder_modules is deprecated; import from the real modules directly.",
16+
DeprecationWarning,
17+
stacklevel=2,
18+
)
19+
20+
from .semantic_analyzer import SemanticAnalyzer # noqa: F401
21+
from .user_profiler import UserProfiler # noqa: F401
22+
from .deduplicator import BookmarkDeduplicator # noqa: F401
23+
from .bookmark_health_checker import HealthChecker, HealthStatus # noqa: F401
24+
from .data_exporter import DataExporter # noqa: F401
1825

1926
try:
20-
from .performance_optimizer import PerformanceMonitor
27+
from .performance_optimizer import PerformanceMonitor # noqa: F401
2128
except Exception:
22-
class PerformanceMonitor:
23-
"""PerformanceMonitor fallback stub."""
24-
def __init__(self, **kwargs):
25-
self.metrics = {}
26-
def get_summary(self):
27-
return self.metrics
28-
def __getattr__(self, name):
29-
if name.startswith("_"):
30-
raise AttributeError(name)
31-
raise AttributeError(name)
32-
33-
# Keep datetime import for any legacy code that did:
34-
# from .placeholder_modules import datetime
35-
from datetime import datetime
29+
PerformanceMonitor = None # type: ignore[misc,assignment]
3630

3731
__all__ = [
3832
"SemanticAnalyzer",

0 commit comments

Comments
 (0)