For large time series datasets, users with potentially unstable network connections are advised to chunk large queries into smaller time segments.
The code below breaks large queries into smaller time-based chunks (30-minute segments by default), which:
- Reduces the likelihood of network timeouts and corruption
- Automatically retries failed chunks without losing progress
- Continues downloading even if individual chunks fail
- Provides clear feedback on download progress
import liberator
import pandas as pd
from datetime import datetime, timedelta
from typing import Union, List, Optional
import time

def getdata(dataset: str,
            start_date: Union[str, datetime],
            end_date: Union[str, datetime],
            symbols: Union[str, List[str]],
            chunk_minutes: int = 30,
            max_retries: int = 3,
            retry_delay: float = 1.0,
            verbose: bool = True) -> pd.DataFrame:
    """
    Download data by breaking the time range into smaller chunks.

    Args:
        dataset: Dataset name (e.g., 'spiderrock_printsets_indexed_single_tenant')
        start_date: Start date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        end_date: End date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        symbols: Single symbol string or list of symbols
        chunk_minutes: Minutes per chunk (default 30)
        max_retries: Maximum retries per chunk (default 3)
        retry_delay: Delay between retries in seconds (default 1.0)
        verbose: Print progress messages (default True)

    Returns:
        pandas.DataFrame: Combined data for the entire time range
    """
    # Parse and validate inputs
    start_dt = _parse_datetime(start_date)
    end_dt = _parse_datetime(end_date)
    if isinstance(symbols, str):
        symbols = [symbols]

    if verbose:
        print(f"Downloading {dataset} from {start_dt} to {end_dt} for {len(symbols)} symbols")
        print(f"Using {chunk_minutes}-minute chunks")

    # Generate time chunks
    chunks = _generate_time_chunks(start_dt, end_dt, chunk_minutes)
    if verbose:
        print(f"Processing {len(chunks)} time chunks...")

    # Download each chunk
    all_dfs = []
    failed_chunks = []
    empty_chunks = []
    for i, (chunk_start, chunk_end) in enumerate(chunks):
        if verbose:  # Progress on every chunk
            # Show actual end time, not just hour:minute, if it's not on a clean boundary
            end_display = chunk_end.strftime('%H:%M:%S') if chunk_end.second != 0 else chunk_end.strftime('%H:%M')
            print(f" Processing chunk {i+1}/{len(chunks)} ({chunk_start.strftime('%H:%M')} - {end_display})", end="")

        # Download chunk with retries
        result = _download_chunk_with_retry(
            dataset=dataset,
            start_time=chunk_start,
            end_time=chunk_end,
            symbols=symbols,
            max_retries=max_retries,
            retry_delay=retry_delay,
            verbose=verbose
        )

        if result is None:
            # Actual failure (exception occurred)
            failed_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - FAILED")
        elif len(result) == 0:
            # Empty result (no data for this time period)
            empty_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - EMPTY")
        else:
            # Success with data
            all_dfs.append(result)
            if verbose:
                print(f" - SUCCESS ({len(result):,} rows)")

    # Report results
    if verbose:
        total_rows = sum(len(df) for df in all_dfs)
        print(f"Complete: {len(all_dfs)} chunks with data, {len(empty_chunks)} empty chunks, {len(failed_chunks)} failed chunks")
        print(f"Total rows downloaded: {total_rows:,}")
        if len(failed_chunks) > 0:
            print(f"Warning: {len(failed_chunks)} chunks failed due to errors")

    # Combine all dataframes
    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)
        # Sort by time if columns exist
        if 'muts' in combined_df.columns and '_seq' in combined_df.columns:
            combined_df = combined_df.sort_values(['muts', '_seq'])
            combined_df.reset_index(drop=True, inplace=True)
        elif 'timestamp' in combined_df.columns:
            combined_df = combined_df.sort_values('timestamp')
            combined_df.reset_index(drop=True, inplace=True)
        return combined_df
    else:
        print("No data downloaded successfully")
        return pd.DataFrame()

def _parse_datetime(dt_input: Union[str, datetime]) -> datetime:
    """Parse string or datetime input into datetime object."""
    if isinstance(dt_input, datetime):
        return dt_input

    dt_str = str(dt_input).strip()

    # Try different datetime formats
    formats = [
        '%Y-%m-%d %H:%M:%S.%f',  # Full datetime with microseconds
        '%Y-%m-%d %H:%M:%S',     # Full datetime
        '%Y-%m-%d %H:%M',        # Date with hour:minute
        '%Y-%m-%d',              # Date only
    ]
    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue

    raise ValueError(f"Unable to parse datetime: {dt_input}")
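
# For illustration, the formats above map inputs to datetimes as follows:
#     _parse_datetime('2025-09-08')              -> datetime(2025, 9, 8, 0, 0)
#     _parse_datetime('2025-09-08 09:30')        -> datetime(2025, 9, 8, 9, 30)
#     _parse_datetime('2025-09-08 09:30:15.250') -> datetime(2025, 9, 8, 9, 30, 15, 250000)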

def _generate_time_chunks(start_dt: datetime, end_dt: datetime, chunk_minutes: int) -> List[tuple]:
    """Generate list of (start, end) datetime tuples for chunks."""
    chunks = []
    current_start = start_dt
    chunk_delta = timedelta(minutes=chunk_minutes)
    while current_start < end_dt:
        current_end = min(current_start + chunk_delta, end_dt)
        chunks.append((current_start, current_end))
        current_start = current_end
    return chunks
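
# For illustration: a range that does not divide evenly simply gets a shorter final
# chunk, truncated at end_dt. For example, 09:30-10:45 with 30-minute chunks yields
# (09:30, 10:00), (10:00, 10:30), (10:30, 10:45):
#     _generate_time_chunks(datetime(2025, 9, 8, 9, 30), datetime(2025, 9, 8, 10, 45), 30)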

def _download_chunk_with_retry(dataset: str,
                               start_time: datetime,
                               end_time: datetime,
                               symbols: List[str],
                               max_retries: int,
                               retry_delay: float,
                               verbose: bool = True) -> Optional[pd.DataFrame]:
    """Download a single time chunk with retry logic."""
    # Format times as strings for liberator
    start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')
    end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')

    for attempt in range(max_retries):
        try:
            # Query the data
            query_result = liberator.query(
                name=dataset,
                symbols=symbols,
                back_to=start_str,
                as_of=end_str
            )
            # Convert to dataframe
            df = liberator.get_dataframe(query_result)
            # Return the dataframe (could be empty, but that's not a failure)
            return df
        except Exception as e:
            if attempt < max_retries - 1:  # Don't sleep on last attempt
                time.sleep(retry_delay)
            else:
                if verbose:
                    print(f" Failed chunk {start_time.strftime('%H:%M')}-{end_time.strftime('%H:%M')} after {max_retries} attempts: {str(e)}")
                return None  # Return None only on actual exception
    return None
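
# Optional variation: the helper above waits a fixed retry_delay between attempts.
# On very unstable connections, exponential backoff may work better; that would mean
# replacing the sleep line with something like:
#     time.sleep(retry_delay * (2 ** attempt))   # waits retry_delay, then 2x, then 4x, ...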

# Example usage functions
def download_single_day(dataset: str, date: str, symbols: Union[str, List[str]], **kwargs) -> pd.DataFrame:
    """
    Convenience function to download a single day's data.

    Args:
        dataset: Dataset name
        date: Date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Additional arguments passed to getdata()

    Returns:
        pandas.DataFrame: Day's data
    """
    start_date = f"{date} 00:00:00"
    end_date = f"{date} 23:59:59"
    return getdata(
        dataset=dataset,
        start_date=start_date,
        end_date=end_date,
        symbols=symbols,
        **kwargs
    )

def download_date_range(dataset: str,
                        start_date: str,
                        end_date: str,
                        symbols: Union[str, List[str]],
                        **kwargs) -> pd.DataFrame:
    """
    Download data across multiple days.

    Args:
        dataset: Dataset name
        start_date: Start date as 'YYYY-MM-DD'
        end_date: End date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Additional arguments passed to getdata()

    Returns:
        pandas.DataFrame: Multi-day data
    """
    start_dt = _parse_datetime(f"{start_date} 00:00:00")
    end_dt = _parse_datetime(f"{end_date} 23:59:59.999")  # Explicit milliseconds
    return getdata(
        dataset=dataset,
        start_date=start_dt,
        end_date=end_dt,
        symbols=symbols,
        **kwargs
    )
Here are some example calls...
# Example 1: Single day, multiple symbols
df1 = download_single_day(
    dataset='spiderrock_printsets_indexed_single_tenant',
    date='2025-09-08',
    symbols=['AAPL', 'GOOGL', 'MSFT']
)

# Example 2: Custom time range with verbose off
df2 = getdata(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-08 09:30:00',
    end_date='2025-09-08 16:00:00',
    symbols='AAPL',
    chunk_minutes=15,
    verbose=False  # Silent operation
)

# Example 3: Multi-day download
df3 = download_date_range(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-03',
    end_date='2025-09-06',
    symbols=['AAPL', 'GOOGL'],
    chunk_minutes=120  # 2-hour chunks for longer periods
)
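
On especially unreliable connections, one more variation (illustrative, but using only parameters that getdata() already exposes) is to shrink the chunks and allow more patient retries:

# Example 4: smaller chunks and more patient retries for an unstable connection
df4 = getdata(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-08 09:30:00',
    end_date='2025-09-08 16:00:00',
    symbols='AAPL',
    chunk_minutes=10,
    max_retries=5,
    retry_delay=2.0
)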
Sample Output from Example 3
Downloading spiderrock_printsets_indexed_single_tenant from 2025-09-03 00:00:00 to 2025-09-06 23:59:59.999000 for 2 symbols
Using 120-minute chunks
Processing 48 time chunks...
Processing chunk 1/48 (00:00 - 02:00) - EMPTY
Processing chunk 2/48 (02:00 - 04:00) - EMPTY
Processing chunk 3/48 (04:00 - 06:00) - EMPTY
Processing chunk 4/48 (06:00 - 08:00) - EMPTY
Processing chunk 5/48 (08:00 - 10:00) - SUCCESS (143,762 rows)
Processing chunk 6/48 (10:00 - 12:00) - SUCCESS (168,790 rows)
Processing chunk 7/48 (12:00 - 14:00) - SUCCESS (72,840 rows)
Processing chunk 8/48 (14:00 - 16:00) - SUCCESS (69,498 rows)
Processing chunk 9/48 (16:00 - 18:00) - EMPTY
Processing chunk 10/48 (18:00 - 20:00) - EMPTY
Processing chunk 11/48 (20:00 - 22:00) - EMPTY
Processing chunk 12/48 (22:00 - 00:00) - EMPTY
Processing chunk 13/48 (00:00 - 02:00) - EMPTY
Processing chunk 14/48 (02:00 - 04:00) - EMPTY
Processing chunk 15/48 (04:00 - 06:00) - EMPTY
Processing chunk 16/48 (06:00 - 08:00) - EMPTY
Processing chunk 17/48 (08:00 - 10:00) - SUCCESS (59,208 rows)
Processing chunk 18/48 (10:00 - 12:00) - SUCCESS (81,511 rows)
Processing chunk 19/48 (12:00 - 14:00) - SUCCESS (41,247 rows)
Processing chunk 20/48 (14:00 - 16:00) - SUCCESS (69,774 rows)
Processing chunk 21/48 (16:00 - 18:00) - SUCCESS (2 rows)
Processing chunk 22/48 (18:00 - 20:00) - EMPTY
Processing chunk 23/48 (20:00 - 22:00) - EMPTY
Processing chunk 24/48 (22:00 - 00:00) - EMPTY
Processing chunk 25/48 (00:00 - 02:00) - EMPTY
Processing chunk 26/48 (02:00 - 04:00) - EMPTY
Processing chunk 27/48 (04:00 - 06:00) - EMPTY
Processing chunk 28/48 (06:00 - 08:00) - EMPTY
Processing chunk 29/48 (08:00 - 10:00) - SUCCESS (61,492 rows)
Processing chunk 30/48 (10:00 - 12:00) - SUCCESS (96,447 rows)
Processing chunk 31/48 (12:00 - 14:00) - SUCCESS (43,307 rows)
Processing chunk 32/48 (14:00 - 16:00) - SUCCESS (51,698 rows)
Processing chunk 33/48 (16:00 - 18:00) - EMPTY
Processing chunk 34/48 (18:00 - 20:00) - EMPTY
Processing chunk 35/48 (20:00 - 22:00) - EMPTY
Processing chunk 36/48 (22:00 - 00:00) - EMPTY
Processing chunk 37/48 (00:00 - 02:00) - EMPTY
Processing chunk 38/48 (02:00 - 04:00) - EMPTY
Processing chunk 39/48 (04:00 - 06:00) - EMPTY
Processing chunk 40/48 (06:00 - 08:00) - EMPTY
Processing chunk 41/48 (08:00 - 10:00) - EMPTY
Processing chunk 42/48 (10:00 - 12:00) - EMPTY
Processing chunk 43/48 (12:00 - 14:00) - EMPTY
Processing chunk 44/48 (14:00 - 16:00) - EMPTY
Processing chunk 45/48 (16:00 - 18:00) - EMPTY
Processing chunk 46/48 (18:00 - 20:00) - EMPTY
Processing chunk 47/48 (20:00 - 22:00) - EMPTY
Processing chunk 48/48 (22:00 - 23:59:59) - EMPTY
Complete: 13 chunks with data, 35 empty chunks, 0 failed chunks
Total rows downloaded: 959,576
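
As a quick sanity check after a chunked download like the one above, you can confirm that the combined frame matches the reported row total and, assuming this dataset carries the muts timestamp column referenced in getdata()'s sort step, that the result comes back time-ordered:

# Optional sanity checks on df3 (the 'muts' column name is assumed from the sort logic above)
print(f"{len(df3):,} rows")                      # should match the reported "Total rows downloaded"
if 'muts' in df3.columns:
    print(df3['muts'].is_monotonic_increasing)   # expected True after getdata()'s sort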