my_quant/02.py at main · phdh4/my_quant · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
"""
File: 02.py
Brief:
Author: hzy
Date: 2026/1/8
"""
"""
stock_data_center.py

StockDataCenter: 单例类，负责 A 股数据的抓取 / 清洗 / 复权 / 本地存储 / 增量更新 / Data QA
设计目标：高度可扩展，方便后续接入 Backtrader 和其它 provider。

依赖（示例）:
    pip install akshare pandas pyarrow duckdb tqdm

说明:
 - 本实现以 akshare 为首选 provider（通过 akshare.* 函数拉取K线 & 复权因子），但保留 provider 抽象层，便于替换。
 - 复权逻辑实现为“前复权（forward-adjusted）”函数：在调用 apply_forward_adjustment 时，需要确认 adj_series 的含义。
 - 本地存储支持 Parquet（按 symbol 分文件）以及 DuckDB（可选）。
 - 并发使用 ThreadPoolExecutor + 简单限速 semaphore + 重试装饰器。

作者: ChatGPT for user (量化数据层骨架)
"""

import os
import time
import threading
import logging
from typing import Optional, Sequence, Callable, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import wraps
import math

import pandas as pd
import numpy as np
from tqdm import tqdm

# 可选依赖
try:
    import akshare as ak
except Exception:
    ak = None  # runtime check later

try:
    import duckdb
except Exception:
    duckdb = None

# pyarrow.writer used by pandas.to_parquet
# logging
logger = logging.getLogger("StockDataCenter")
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(ch)


# ---------------------------
# 简单重试 + backoff 装饰器
# ---------------------------
def retry(exceptions=Exception, tries=4, delay=1.0, backoff=2.0, jitter=0.1):
    """
    简单的重试装饰器（阻塞）。用于网络请求等临时失败的场景。
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            _tries, _delay = tries, delay
            while _tries > 1:
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    sleep_time = _delay + (np.random.rand() * jitter)
                    logger.warning(f"Call {func.__name__} failed: {e!r}, retrying in {sleep_time:.2f}s ({_tries-1} tries left)")
                    time.sleep(sleep_time)
                    _tries -= 1
                    _delay *= backoff
            # last attempt
            return func(*args, **kwargs)
        return wrapper
    return decorator


# ---------------------------
# 单例 StockDataCenter 类
# ---------------------------
class StockDataCenter:
    """
    StockDataCenter 单例，负责：
      - 从 provider 获取原始数据（OHLCV、复权因子、退市历史等）
      - 数据清洗与 QA
      - 复权（前复权）
      - 本地存储（Parquet 或 DuckDB）
      - 增量更新（按交易日），并行/限速/重试
      - 导出为 Backtrader 可识别的 DataFrame
    架构要点：
      - provider 层抽象（self.provider）: 方便增加其它数据源
      - 存储层抽象（parquet / duckdb）
    使用示例见文件末尾
    """

    _instance = None
    _instance_lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        with cls._instance_lock:
            if cls._instance is None:
                cls._instance = super(StockDataCenter, cls).__new__(cls)
        return cls._instance

    def __init__(
        self,
        data_dir: str = "data",
        storage_backend: str = "parquet",  # or 'duckdb'
        duckdb_path: Optional[str] = None,
        request_rate_per_sec: float = 1.0,
        max_workers: int = 8,
        provider: Optional[Any] = None,
        allow_overwrite: bool = False,
    ):
        # init 只在第一次实例化时生效（单例）
        if getattr(self, "_initialized", False):
            return
        self._initialized = True

        self.data_dir = data_dir
        os.makedirs(self.data_dir, exist_ok=True)
        self.storage_backend = storage_backend
        self.duckdb_path = duckdb_path or os.path.join(self.data_dir, "stock_data.duckdb")
        self.request_rate_per_sec = max(request_rate_per_sec, 0.01)
        self._min_interval = 1.0 / self.request_rate_per_sec
        self._last_request_ts = 0.0
        self._last_request_lock = threading.Lock()
        self.max_workers = max_workers
        self.allow_overwrite = allow_overwrite

        # provider：必须实现 fetch_ohlcv(symbol, start, end) -> pd.DataFrame,
        # 可选：fetch_adj_factors(symbol, start, end) -> pd.Series (index 对齐 date)
        if provider is None:
            if ak is None:
                raise RuntimeError("akshare not installed and no provider provided. Install akshare or pass a provider.")
            self.provider = AkshareProvider()
        else:
            self.provider = provider

        # duckdb init
        if self.storage_backend == "duckdb" and duckdb is None:
            raise RuntimeError("duckdb not installed but storage_backend='duckdb' selected. Install duckdb or switch to parquet.")

        # concurrency semaphore for rate limiting (basic)
        # 限制同时并发请求数（防止短时间内大量请求）
        self._semaphore = threading.Semaphore(self.max_workers)

        logger.info(f"StockDataCenter initialized: data_dir={self.data_dir}, backend={self.storage_backend}, max_workers={self.max_workers}")

    # ------------
    # 内部工具
    # ------------
    def _rate_limited_call(self, func: Callable, *args, **kwargs):
        """
        基于简单时间间隔的限速：调用前确保距离上次请求 >= min_interval
        同时使用 semaphore 控制并发数。
        """
        with self._semaphore:
            with self._last_request_lock:
                now = time.time()
                wait = self._min_interval - (now - self._last_request_ts)
                if wait > 0:
                    time.sleep(wait)
                self._last_request_ts = time.time()
            return func(*args, **kwargs)

    # -----------------------------
    # Provider wrapper 调用（含 retry）
    # -----------------------------
    @retry(Exception, tries=4, delay=1.0, backoff=2.0, jitter=0.3)
    def _fetch_ohlcv_from_provider(self, symbol: str, start: Optional[str], end: Optional[str]) -> pd.DataFrame:
        return self._rate_limited_call(self.provider.fetch_ohlcv, symbol, start, end)

    @retry(Exception, tries=3, delay=1.0, backoff=2.0, jitter=0.2)
    def _fetch_adj_from_provider(self, symbol: str, start: Optional[str], end: Optional[str]) -> Optional[pd.Series]:
        if not hasattr(self.provider, "fetch_adj_factor"):
            return None
        return self._rate_limited_call(self.provider.fetch_adj_factor, symbol, start, end)

    # -----------------------------
    # Data QA / 清洗
    # -----------------------------
    def qa_check(self, df: pd.DataFrame, symbol: Optional[str] = None) -> pd.DataFrame:
        """
        基本的 Data QA，返回处理后的 DataFrame 或者抛异常。
        Checks:
          - 必要列存在: date/index, open, high, low, close, volume
          - 日期索引连续性（不强制严格连续，但至少是日期类型、无重复）
          - 去重
          - 非法值检测（NaN, 负值）
        """
        if df is None or len(df) == 0:
            raise ValueError(f"Empty data for {symbol}")

        # 标准化列名：将 date 转为 datetime index
        if "date" in df.columns:
            df = df.copy()
            df["date"] = pd.to_datetime(df["date"])
            df = df.sort_values("date").drop_duplicates(subset=["date"], keep="last").set_index("date")
        elif isinstance(df.index, pd.DatetimeIndex):
            df = df.sort_index()
        else:
            # try to coerce index to datetime
            try:
                df = df.copy()
                df.index = pd.to_datetime(df.index)
                df = df.sort_index()
            except Exception:
                raise ValueError("DataFrame must contain 'date' column or datetime index")

        required_cols = ["open", "high", "low", "close", "volume"]
        for c in required_cols:
            if c not in df.columns:
                raise ValueError(f"Required column '{c}' missing in data for {symbol}")

        # basic numeric cleaning
        df = df[~df.index.duplicated(keep="last")]
        # remove rows where all price columns are NaN
        price_cols = ["open", "high", "low", "close"]
        df = df.dropna(axis=0, how="all", subset=price_cols)

        # fill small NaNs by forward/back fill but be conservative
        if df[price_cols].isnull().any().any():
            logger.info(f"Filling small gaps in price data for {symbol} by forward/back fill")
            df[price_cols] = df[price_cols].ffill().bfill()

        # volume should be non-negative
        if (df["volume"] < 0).any():
            raise ValueError("Negative volume detected")

        # reasonable price ranges check (optional)
        if (df["close"] <= 0).any():
            raise ValueError("Non-positive close price detected")

        return df

    # -----------------------------
    # 复权（前复权）
    # -----------------------------
    def apply_forward_adjustment(self, df: pd.DataFrame, adj_series: pd.Series) -> pd.DataFrame:
        """
        Apply forward-adjustment (前复权) to OHLC columns using adj_series.
        adj_series: pd.Series indexed by date, representing cumulative adjustment factor or factor relative to raw price.
        IMPORTANT: 不同数据源 adj 的定义可能不同，请确保 adj_series 的含义：
           - 我假设 adj_series[t] 表示该日用于将“当日原始价格”换算到“基准（最新日）”的因子序列（常见方式）。
           - 典型公式: adj_price[t] = raw_price[t] * (adj_latest / adj[t])
        你可能需要根据 akshare 返回的复权因子调整这个公式。
        """
        if adj_series is None or len(adj_series) == 0:
            logger.info("No adjustment series provided; returning original df")
            return df

        # align by index
        adj = adj_series.copy()
        adj.index = pd.to_datetime(adj.index)
        df = df.copy()
        df_index = df.index
        adj = adj.reindex(df_index).ffill().bfill()

        # compute forward adjustment factor: factor = adj_latest / adj_current
        latest = adj.iloc[-1]
        if latest == 0 or np.isnan(latest):
            logger.warning("Latest adj factor is zero or NaN; skipping adjustment")
            return df
        factor = latest / adj

        for col in ["open", "high", "low", "close"]:
            if col in df.columns:
                df[col] = df[col] * factor

        # volume typically *not* adjusted for price adjustment; for some systems you may divide volume by factor
        # keep volume as-is, but document
        return df

    # -----------------------------
    # 本地存储：Parquet / DuckDB
    # -----------------------------
    def _symbol_parquet_path(self, symbol: str) -> str:
        return os.path.join(self.data_dir, f"{symbol}.parquet")

    def save_to_parquet(self, symbol: str, df: pd.DataFrame, mode: str = "append"):
        path = self._symbol_parquet_path(symbol)
        # ensure index is column 'date' before writing, to simplify merges later
        df_to_write = df.copy()
        df_to_write = df_to_write.reset_index().rename(columns={"index": "date"})
        if mode == "overwrite" or (not os.path.exists(path)):
            df_to_write.to_parquet(path, index=False)
        elif mode == "append":
            # load existing and merge carefully by date to prevent accidental overwrite
            existing = pd.read_parquet(path)
            combined = pd.concat([existing, df_to_write], ignore_index=True)
            combined = combined.drop_duplicates(subset=["date"], keep="last")
            combined = combined.sort_values("date")
            combined.to_parquet(path, index=False)
        else:
            raise ValueError("Unknown mode for save_to_parquet")

    def load_from_parquet(self, symbol: str) -> Optional[pd.DataFrame]:
        path = self._symbol_parquet_path(symbol)
        if not os.path.exists(path):
            return None
        df = pd.read_parquet(path)
        if "date" in df.columns:
            df["date"] = pd.to_datetime(df["date"])
            df = df.set_index("date").sort_index()
        return df

    def save_to_duckdb(self, symbol: str, df: pd.DataFrame, table_name: Optional[str] = None, mode: str = "append"):
        if duckdb is None:
            raise RuntimeError("duckdb not installed")
        conn = duckdb.connect(database=self.duckdb_path)
        tname = table_name or f"symbol_{symbol.replace('.', '_')}"
        df_to_write = df.copy().reset_index().rename(columns={"index": "date"})
        # duckdb supports upsert via SQL; simplest is to create table then merge
        if mode == "overwrite":
            conn.execute(f"DROP TABLE IF EXISTS {tname}")
            conn.register("tmp", df_to_write)
            conn.execute(f"CREATE TABLE {tname} AS SELECT * FROM tmp")
        else:
            # append, but protect duplicates by date: read existing, concat, dedup, write back
            try:
                existing = conn.execute(f"SELECT * FROM {tname}").df()
                combined = pd.concat([existing, df_to_write], ignore_index=True)
                combined = combined.drop_duplicates(subset=["date"], keep="last").sort_values("date")
                conn.execute(f"DROP TABLE IF EXISTS {tname}")
                conn.register("tmp", combined)
                conn.execute(f"CREATE TABLE {tname} AS SELECT * FROM tmp")
            except Exception:
                # table not exist yet
                conn.register("tmp", df_to_write)
                conn.execute(f"CREATE TABLE {tname} AS SELECT * FROM tmp")
        conn.close()

    def load_from_duckdb(self, symbol: str, table_name: Optional[str] = None) -> Optional[pd.DataFrame]:
        if duckdb is None:
            raise RuntimeError("duckdb not installed")
        conn = duckdb.connect(database=self.duckdb_path)
        tname = table_name or f"symbol_{symbol.replace('.', '_')}"
        try:
            df = conn.execute(f"SELECT * FROM {tname}").df()
            if "date" in df.columns:
                df["date"] = pd.to_datetime(df["date"])
                df = df.set_index("date").sort_index()
            conn.close()
            return df
        except Exception:
            conn.close()
            return None

    # -----------------------------
    # 增量更新与主流程
    # -----------------------------
    def incremental_update_symbol(self, symbol: str, start: Optional[str] = None, end: Optional[str] = None, force: bool = False) -> pd.DataFrame:
        """
        按 symbol 增量更新并写入本地存储：
         - 检查本地已有数据的最新日期，从 next_trade_day 开始拉取；
         - 若本地无数据，则拉取 start->end (start 可由外界指定)；
         - 合并后 QA 校验 -> 保存（Parquet 或 DuckDB）
        返回合并后的 DataFrame（index datetime）
        """
        logger.info(f"Updating symbol {symbol}")
        local_df = self.load(symbol)
        # determine fetch range
        if local_df is None or force:
            fetch_start = start
        else:
            last_local_date = local_df.index.max()
            # next calendar day - but we assume provider will handle weekends/非交易日
            fetch_start = (last_local_date + pd.Timedelta(days=1)).strftime("%Y%m%d")
        fetch_end = end

        # fetch raw
        raw = self._fetch_ohlcv_from_provider(symbol, fetch_start, fetch_end)
        if raw is None or len(raw) == 0:
            logger.info(f"No new data fetched for {symbol}")
            # still return local copy if exists
            return local_df

        # qa
        raw = self.qa_check(raw, symbol)

        # try fetch adj series and adjust
        adj_series = None
        try:
            adj_series = self._fetch_adj_from_provider(symbol, fetch_start, fetch_end)
        except Exception as e:
            logger.warning(f"Failed to fetch adj factor for {symbol}: {e}")

        if adj_series is not None:
            raw_adj = self.apply_forward_adjustment(raw, adj_series)
        else:
            raw_adj = raw

        # merge with local
        if local_df is None:
            merged = raw_adj
        else:
            merged = pd.concat([local_df, raw_adj]).sort_index()
            merged = merged[~merged.index.duplicated(keep="last")]

        # final QA before save (additional checks)
        merged = self.qa_check(merged, symbol)

        # save
        if self.storage_backend == "parquet":
            self.save_to_parquet(symbol, merged, mode="overwrite" if self.allow_overwrite else "append")
        else:
            self.save_to_duckdb(symbol, merged, mode="overwrite" if self.allow_overwrite else "append")

        logger.info(f"Symbol {symbol} updated: {len(merged)} rows")
        return merged

    def update_many_symbols(self, symbols: Sequence[str], start: Optional[str] = None, end: Optional[str] = None, threads: Optional[int] = None):
        """
        批量更新：并行 fetch + 按 symbol incremental_update_symbol
        - 默认线程数 self.max_workers
        - 采用 ThreadPoolExecutor，捕获异常
        """
        threads = threads or self.max_workers
        results: Dict[str, Optional[pd.DataFrame]] = {}
        with ThreadPoolExecutor(max_workers=threads) as exe:
            futures = {exe.submit(self.incremental_update_symbol, s, start, end): s for s in symbols}
            for fut in tqdm(as_completed(futures), total=len(futures), desc="更新 symbols"):
                s = futures[fut]
                try:
                    df = fut.result()
                    results[s] = df
                except Exception as e:
                    logger.exception(f"Failed to update {s}: {e}")
                    results[s] = None
        return results

    # -----------------------------
    # Load wrapper（根据 backend）
    # -----------------------------
    def load(self, symbol: str) -> Optional[pd.DataFrame]:
        if self.storage_backend == "parquet":
            return self.load_from_parquet(symbol)
        else:
            return self.load_from_duckdb(symbol)

    # -----------------------------
    # 转为 Backtrader DataFrame
    # -----------------------------
    def to_backtrader_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        将 df 转为 Backtrader-friendly 格式：
          - index: datetime index 或者名为 'date' 的列 -> 转为 index
          - columns: 'open','high','low','close','volume','openinterest'
        Backtrader 要求：index 为 datetime（最好包含时间）
        """
        if df is None or len(df) == 0:
            return df
        df2 = df.copy()
        if not isinstance(df2.index, pd.DatetimeIndex):
            if "date" in df2.columns:
                df2["date"] = pd.to_datetime(df2["date"])
                df2 = df2.set_index("date")
            else:
                raise ValueError("DataFrame must have datetime index or 'date' column")
        # ensure columns exist
        for c in ["open", "high", "low", "close", "volume"]:
            if c not in df2.columns:
                raise ValueError(f"Missing column {c} for Backtrader")
        # backtrader prefers float and int types
        df2[["open", "high", "low", "close"]] = df2[["open", "high", "low", "close"]].astype(float)
        df2["volume"] = df2["volume"].astype(float)
        if "openinterest" not in df2.columns:
            df2["openinterest"] = 0
        return df2[["open", "high", "low", "close", "volume", "openinterest"]]

    # -----------------------------
    # Surviviorship bias / delisted stocks
    # -----------------------------
    def fetch_delisted_history(self, symbol: str, start: Optional[str] = None, end: Optional[str] = None) -> Optional[pd.DataFrame]:
        """
        尽量尝试获取退市（已停牌/退市）股票的历史数据，避免生存偏差。
        provider 需实现 fetch_delisted_history 方法（可选）。
        """
        if hasattr(self.provider, "fetch_delisted_history"):
            return self.provider.fetch_delisted_history(symbol, start, end)
        else:
            logger.warning("Provider does not implement fetch_delisted_history; survivorship data may be incomplete.")
            return None

    # 更多工具函数可放置此处（例如：交易日历获取、节假日处理等）
    def fetch_trade_calendar(self, start: Optional[str] = None, end: Optional[str] = None) -> pd.DatetimeIndex:
        """
        尝试从 provider 获取交易日历；如果 provider 不支持，则退回到 pandas bdate_range（注意 A 股节假日问题）
        """
        if hasattr(self.provider, "fetch_trade_calendar"):
            cal = self.provider.fetch_trade_calendar(start, end)
            return pd.to_datetime(cal)
        else:
            logger.warning("Provider does not provide trade calendar; using pandas.bdate_range fallback (may be inaccurate for A股节假日)")
            s = pd.to_datetime(start) if start is not None else pd.Timestamp("2000-01-01")
            e = pd.to_datetime(end) if end is not None else pd.Timestamp.today()
            return pd.bdate_range(s, e)

# ---------------------------
# Provider: akshare 的简单实现（可根据你本地 akshare 版本微调）
# ---------------------------
class AkshareProvider:
    """
    简单封装 akshare 的常用接口
    请根据你本地 akshare 版本确认函数名 / 参数 / 返回字段是否一致。
    """

    def __init__(self):
        if ak is None:
            raise RuntimeError("akshare required for AkshareProvider but not installed")
        # akshare 无需额外认证

    def fetch_ohlcv(self, symbol: str, start: Optional[str], end: Optional[str]) -> pd.DataFrame:
        """
        使用 akshare 拉取日线（示例函数名：ak.stock_zh_a_daily）
        symbol: akshare 的标识，例如 'sh600000' 或 '600000'（视 akshare 版本）
        start/end: 字符串，格式 'YYYYMMDD'，传 None 则 akshare 默认为全部
        返回：DataFrame 包含 date/open/high/low/close/volume 等列，date 作为列或索引
        """
        # 这里给出常见 akshare 接口示例，请根据你本地 akshare 调整
        # ak.stock_zh_a_daily(symbol='sh600000', start_date='20100101', end_date='20240101')
        # 注意：有些 akshare 需要 symbol 形如 "sh600000", 有些则需要 "600000" + exchange 参数
        try:
            # 尝试常见调用
            df = ak.stock_zh_a_daily(symbol=symbol, start_date=start, end_date=end)
        except Exception as e:
            # 如果失败，请根据 akshare 版本做适配
            logger.exception(f"ak.stock_zh_a_daily failed for {symbol}: {e}")
            raise

        # normalize columns: akshare 通常返回 columns：date, open, close, high, low, volume, amount
        # reorder to open/high/low/close/volume
        df = df.rename(columns={c: c.lower() for c in df.columns})
        # ensure required columns exist
        # some akshare return 'vol' instead of 'volume'
        if "vol" in df.columns and "volume" not in df.columns:
            df["volume"] = df["vol"]
        # keep only essential
        keep = [c for c in ["date", "open", "high", "low", "close", "volume"] if c in df.columns]
        df = df[keep]
        # ensure date column exists
        if "date" not in df.columns and isinstance(df.index, pd.DatetimeIndex):
            df = df.reset_index().rename(columns={"index": "date"})
        return df

    def fetch_adj_factor(self, symbol: str, start: Optional[str], end: Optional[str]) -> Optional[pd.Series]:
        """
        尝试从 akshare 获取复权因子（如果可用）
        akshare 可能提供不同接口/字段，示例函数在不同版本上可能不同：
          - ak.stock_zh_a_daily_qfq / ak.stock_zh_a_adj_factor / stock_zh_a_daily_qfq
        你需要根据本机 akshare 版本调整此处实现。
        """
        # 尝试一些常见接口（请根据 akshare 版本修改）
        try:
            # akshare 早期有 ak.stock_zh_a_daily_qfq 返回前复权的日线；如果存在，直接使用该接口（它已经返回前复权价格）
            if hasattr(ak, "stock_zh_a_daily_qfq"):
                df_qfq = ak.stock_zh_a_daily_qfq(symbol=symbol, start_date=start, end_date=end)
                # df_qfq 通常包含 date 和 close 等，若直接返回前复权日线，则我们不需要再单独处理复权因子
                # build a synthetic adj factor series of ones to indicate adjusted
                s = pd.Series(1.0, index=pd.to_datetime(df_qfq["date"]))
                return s

            # 如果 ak 有提取复权因子的接口，可在此添加
            # fallback: None
            return None
        except Exception as e:
            logger.exception(f"Failed to fetch adj factor for {symbol}: {e}")
            return None

    # 可选：fetch delisted history / trade calendar
    def fetch_delisted_history(self, symbol: str, start: Optional[str], end: Optional[str]) -> Optional[pd.DataFrame]:
        # akshare 可能有接口，但这里我们默认 None
        return None

    def fetch_trade_calendar(self, start: Optional[str], end: Optional[str]):
        # akshare 有 trade_date_hist_sina 或者 other calendar API，可实现
        # TODO: implement if needed
        return None


# ---------------------------
# 使用示例
# ---------------------------
if __name__ == "__main__":
    # 示例：批量更新 3 支股票（注意 symbol 的写法需与你的 akshare 版本匹配）
    sdc = StockDataCenter(data_dir="data", storage_backend="parquet", request_rate_per_sec=2.0, max_workers=6)
    symbols = ["sh600000", "sz000001", "sh600519"]  # 请按实际 akshare 要求填写
    results = sdc.update_many_symbols(symbols, start="20000101", end=None)
    # 导出某个股票供 Backtrader 使用
    df = sdc.load("sh600000")
    if df is not None:
        bt_df = sdc.to_backtrader_df(df)
        print(bt_df.tail())