fix: prevent look-ahead bias in backtesting data fetchers (#475)

This commit is contained in:
Yijia-Xiao
2026-03-29 17:34:35 +00:00
parent 589b351f2a
commit e1113880a1
4 changed files with 108 additions and 142 deletions
+14 -57
View File
@@ -3,7 +3,7 @@ from datetime import datetime
from dateutil.relativedelta import relativedelta
import yfinance as yf
import os
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry, load_ohlcv, filter_financials_by_date
def get_YFin_data_online(
symbol: Annotated[str, "ticker symbol of the company"],
@@ -194,58 +194,9 @@ def _get_stock_stats_bulk(
Fetches data once and calculates indicator for all available dates.
Returns dict mapping date strings to indicator values.
"""
from .config import get_config
import pandas as pd
from stockstats import wrap
import os
config = get_config()
online = config["data_vendors"]["technical_indicators"] != "local"
if not online:
# Local data path
try:
data = pd.read_csv(
os.path.join(
config.get("data_cache_dir", "data"),
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
),
on_bad_lines="skip",
)
except FileNotFoundError:
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
else:
# Online data fetching with caching
today_date = pd.Timestamp.today()
curr_date_dt = pd.to_datetime(curr_date)
end_date = today_date
start_date = today_date - pd.DateOffset(years=15)
start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
os.makedirs(config["data_cache_dir"], exist_ok=True)
data_file = os.path.join(
config["data_cache_dir"],
f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv",
)
if os.path.exists(data_file):
data = pd.read_csv(data_file, on_bad_lines="skip")
else:
data = yf_retry(lambda: yf.download(
symbol,
start=start_date_str,
end=end_date_str,
multi_level_index=False,
progress=False,
auto_adjust=True,
))
data = data.reset_index()
data.to_csv(data_file, index=False)
data = _clean_dataframe(data)
data = load_ohlcv(symbol, curr_date)
df = wrap(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
@@ -353,7 +304,7 @@ def get_fundamentals(
def get_balance_sheet(
ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
):
"""Get balance sheet data from yfinance."""
try:
@@ -363,7 +314,9 @@ def get_balance_sheet(
data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet)
else:
data = yf_retry(lambda: ticker_obj.balance_sheet)
data = filter_financials_by_date(data, curr_date)
if data.empty:
return f"No balance sheet data found for symbol '{ticker}'"
@@ -383,7 +336,7 @@ def get_balance_sheet(
def get_cashflow(
ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
):
"""Get cash flow data from yfinance."""
try:
@@ -393,7 +346,9 @@ def get_cashflow(
data = yf_retry(lambda: ticker_obj.quarterly_cashflow)
else:
data = yf_retry(lambda: ticker_obj.cashflow)
data = filter_financials_by_date(data, curr_date)
if data.empty:
return f"No cash flow data found for symbol '{ticker}'"
@@ -413,7 +368,7 @@ def get_cashflow(
def get_income_statement(
ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
):
"""Get income statement data from yfinance."""
try:
@@ -423,7 +378,9 @@ def get_income_statement(
data = yf_retry(lambda: ticker_obj.quarterly_income_stmt)
else:
data = yf_retry(lambda: ticker_obj.income_stmt)
data = filter_financials_by_date(data, curr_date)
if data.empty:
return f"No income statement data found for symbol '{ticker}'"