Downloading Very Large Datasets
For large time series datasets, users experiencing network instability should chunk large queries into smaller time segments rather than attempting a single monolithic download.

Core Strategy
The recommended approach divides large queries into smaller, manageable time-based chunks (typically 30-minute segments; see the sketch after this list). This methodology:
- Reduces the likelihood of network timeouts and data corruption
- Automatically retries failed chunks without losing progress
- Continues downloading even when individual chunks fail
- Provides clear feedback on download progress
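To make the segmentation concrete, here is a minimal standalone sketch of the chunking step (standard library only; the 9:30-16:00 session times are arbitrary examples). It mirrors the _generate_time_chunks helper in the full implementation below: a 6.5-hour session splits into thirteen 30-minute chunks, with the final chunk clamped to the end time.

from datetime import datetime, timedelta

start = datetime(2025, 9, 8, 9, 30)   # session open (arbitrary example)
end = datetime(2025, 9, 8, 16, 0)     # session close
step = timedelta(minutes=30)

# Walk the range in fixed-size steps, clamping the last chunk to the end time
chunks = []
current = start
while current < end:
    chunks.append((current, min(current + step, end)))
    current = chunks[-1][1]

print(len(chunks))   # 13
print(chunks[0])     # (datetime(2025, 9, 8, 9, 30), datetime(2025, 9, 8, 10, 0))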
Complete Python Implementation
import liberator
import pandas as pd
from datetime import datetime, timedelta
from typing import Union, List, Optional
import time
def getdata(dataset: str,
            start_date: Union[str, datetime],
            end_date: Union[str, datetime],
            symbols: Union[str, List[str]],
            chunk_minutes: int = 30,
            max_retries: int = 3,
            retry_delay: float = 1.0,
            verbose: bool = True) -> pd.DataFrame:
    """
    Download data by breaking the time range into smaller chunks.

    Args:
        dataset: Dataset name (e.g., 'spiderrock_printsets_indexed_single_tenant')
        start_date: Start date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        end_date: End date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        symbols: Single symbol string or list of symbols
        chunk_minutes: Minutes per chunk (default 30)
        max_retries: Maximum retries per chunk (default 3)
        retry_delay: Delay between retries in seconds (default 1.0)
        verbose: Print progress messages (default True)

    Returns:
        pandas.DataFrame: Combined data for the entire time range
    """
    # Parse and validate inputs
    start_dt = _parse_datetime(start_date)
    end_dt = _parse_datetime(end_date)
    if isinstance(symbols, str):
        symbols = [symbols]

    if verbose:
        print(f"Downloading {dataset} from {start_dt} to {end_dt} for {len(symbols)} symbols")
        print(f"Using {chunk_minutes}-minute chunks")

    # Generate time chunks
    chunks = _generate_time_chunks(start_dt, end_dt, chunk_minutes)
    if verbose:
        print(f"Processing {len(chunks)} time chunks...")

    # Download each chunk
    all_dfs = []
    failed_chunks = []
    empty_chunks = []
    for i, (chunk_start, chunk_end) in enumerate(chunks):
        if verbose:
            end_display = chunk_end.strftime('%H:%M:%S') if chunk_end.second != 0 else chunk_end.strftime('%H:%M')
            print(f" Processing chunk {i+1}/{len(chunks)} ({chunk_start.strftime('%H:%M')} - {end_display})", end="")

        # Download chunk with retries
        result = _download_chunk_with_retry(
            dataset=dataset,
            start_time=chunk_start,
            end_time=chunk_end,
            symbols=symbols,
            max_retries=max_retries,
            retry_delay=retry_delay,
            verbose=verbose
        )

        if result is None:
            # Actual failure (exception occurred)
            failed_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - FAILED")
        elif len(result) == 0:
            # Empty result (no data for this time period)
            empty_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - EMPTY")
        else:
            # Success with data
            all_dfs.append(result)
            if verbose:
                print(f" - SUCCESS ({len(result):,} rows)")

    # Report results
    if verbose:
        total_rows = sum(len(df) for df in all_dfs)
        print(f"Complete: {len(all_dfs)} chunks with data, {len(empty_chunks)} empty chunks, {len(failed_chunks)} failed chunks")
        print(f"Total rows downloaded: {total_rows:,}")
        if len(failed_chunks) > 0:
            print(f"Warning: {len(failed_chunks)} chunks failed due to errors")

    # Combine all dataframes
    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)
        # Sort by time if columns exist
        if 'muts' in combined_df.columns and '_seq' in combined_df.columns:
            combined_df = combined_df.sort_values(['muts', '_seq'])
            combined_df.reset_index(drop=True, inplace=True)
        elif 'timestamp' in combined_df.columns:
            combined_df = combined_df.sort_values('timestamp')
            combined_df.reset_index(drop=True, inplace=True)
        return combined_df
    else:
        if verbose:
            print("No data downloaded successfully")
        return pd.DataFrame()
def _parse_datetime(dt_input: Union[str, datetime]) -> datetime:
    """Parse string or datetime input into datetime object."""
    if isinstance(dt_input, datetime):
        return dt_input
    dt_str = str(dt_input).strip()
    # Try different datetime formats, from most to least specific
    formats = [
        '%Y-%m-%d %H:%M:%S.%f',  # Full datetime with microseconds
        '%Y-%m-%d %H:%M:%S',     # Full datetime
        '%Y-%m-%d %H:%M',        # Date with hour:minute
        '%Y-%m-%d',              # Date only
    ]
    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unable to parse datetime: {dt_input}")
def _generate_time_chunks(start_dt: datetime, end_dt: datetime, chunk_minutes: int) -> List[tuple]:
    """Generate list of (start, end) datetime tuples for chunks."""
    chunks = []
    current_start = start_dt
    chunk_delta = timedelta(minutes=chunk_minutes)
    while current_start < end_dt:
        current_end = min(current_start + chunk_delta, end_dt)
        chunks.append((current_start, current_end))
        current_start = current_end
    return chunks
def _download_chunk_with_retry(dataset: str,
                               start_time: datetime,
                               end_time: datetime,
                               symbols: List[str],
                               max_retries: int,
                               retry_delay: float,
                               verbose: bool = True) -> Optional[pd.DataFrame]:
    """Download a single time chunk with retry logic."""
    # Format times as strings for liberator
    start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')
    end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')

    for attempt in range(max_retries):
        try:
            # Query the data
            query_result = liberator.query(
                name=dataset,
                symbols=symbols,
                back_to=start_str,
                as_of=end_str
            )
            # Convert to dataframe; an empty result is valid, not a failure
            df = liberator.get_dataframe(query_result)
            return df
        except Exception as e:
            if attempt < max_retries - 1:  # Don't sleep after the last attempt
                time.sleep(retry_delay)
            else:
                if verbose:
                    print(f" Failed chunk {start_time.strftime('%H:%M')}-{end_time.strftime('%H:%M')} after {max_retries} attempts: {str(e)}")
                return None  # Return None only on an actual exception
    return None
# Example usage functions
def download_single_day(dataset: str, date: str, symbols: Union[str, List[str]], **kwargs) -> pd.DataFrame:
    """
    Convenience function to download a single day's data.

    Args:
        dataset: Dataset name
        date: Date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Additional arguments passed to getdata()

    Returns:
        pandas.DataFrame: Day's data
    """
    start_date = f"{date} 00:00:00"
    end_date = f"{date} 23:59:59"
    return getdata(
        dataset=dataset,
        start_date=start_date,
        end_date=end_date,
        symbols=symbols,
        **kwargs
    )
def download_date_range(dataset: str,
                        start_date: str,
                        end_date: str,
                        symbols: Union[str, List[str]],
                        **kwargs) -> pd.DataFrame:
    """
    Download data across multiple days.

    Args:
        dataset: Dataset name
        start_date: Start date as 'YYYY-MM-DD'
        end_date: End date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Additional arguments passed to getdata()

    Returns:
        pandas.DataFrame: Multi-day data
    """
    start_dt = _parse_datetime(f"{start_date} 00:00:00")
    end_dt = _parse_datetime(f"{end_date} 23:59:59.999")
    return getdata(
        dataset=dataset,
        start_date=start_dt,
        end_date=end_dt,
        symbols=symbols,
        **kwargs
    )
Usage Examples
Single Day, Multiple Symbols
df1 = download_single_day(
    dataset='spiderrock_printsets_indexed_single_tenant',
    date='2025-09-08',
    symbols=['AAPL', 'GOOGL', 'MSFT']
)
Custom Time Range
df2 = getdata(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-08 09:30:00',
    end_date='2025-09-08 16:00:00',
    symbols='AAPL',
    chunk_minutes=15,
    verbose=False
)
Multi-Day Download
df3 = download_date_range(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-03',
    end_date='2025-09-06',
    symbols=['AAPL', 'GOOGL'],
    chunk_minutes=120  # 2-hour chunks for longer periods
)
Sample Output
Downloading spiderrock_printsets_indexed_single_tenant from 2025-09-03 00:00:00 to 2025-09-06 23:59:59.999000 for 2 symbols
Using 120-minute chunks
Processing 48 time chunks...
Processing chunk 1/48 (00:00 - 02:00) - EMPTY
Processing chunk 2/48 (02:00 - 04:00) - EMPTY
Processing chunk 3/48 (04:00 - 06:00) - EMPTY
Processing chunk 4/48 (06:00 - 08:00) - EMPTY
Processing chunk 5/48 (08:00 - 10:00) - SUCCESS (143,762 rows)
Processing chunk 6/48 (10:00 - 12:00) - SUCCESS (168,790 rows)
...
Complete: 13 chunks with data, 35 empty chunks, 0 failed chunks
Total rows downloaded: 959,576
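For long multi-day pulls, you may also want partial results to survive an interrupted session. One way to get that is to loop over days with download_single_day and persist each completed day to disk, skipping days already present. This is a sketch only: the downloads/ directory, the per-day Parquet naming, and the symbol list are illustrative assumptions, and DataFrame.to_parquet requires pyarrow or fastparquet.

import os
import pandas as pd

out_dir = 'downloads'  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)

for day in pd.date_range('2025-09-03', '2025-09-06', freq='D'):
    path = os.path.join(out_dir, f"{day.date()}.parquet")
    if os.path.exists(path):
        continue  # this day finished in an earlier session; skip it
    df = download_single_day(
        dataset='spiderrock_printsets_indexed_single_tenant',
        date=str(day.date()),
        symbols=['AAPL', 'GOOGL'],
    )
    if len(df) > 0:
        df.to_parquet(path)

# Recombine whatever has been downloaded so far
files = sorted(os.listdir(out_dir))
combined = pd.concat([pd.read_parquet(os.path.join(out_dir, f)) for f in files],
                     ignore_index=True)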
For multi-day downloads, increase chunk_minutes to reduce the number of API calls; a value of 120 (2 hours) works well for longer date ranges.

The chunking approach is designed for network resilience. If you are on a stable connection and downloading a moderately sized dataset, a single query may be simpler and faster.
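For comparison, the unchunked equivalent is a single call through the same query interface used above; it is simpler, but one network error loses the entire download.

# One monolithic query for the full session
rows = liberator.query(
    name='spiderrock_printsets_indexed_single_tenant',
    symbols=['AAPL'],
    back_to='2025-09-08 09:30:00',
    as_of='2025-09-08 16:00:00'
)
df = liberator.get_dataframe(rows)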

