
Downloading Very Large Datasets

For large time series datasets, users experiencing network instability should chunk large queries into smaller time segments rather than attempting single monolithic downloads.

Core Strategy

The recommended approach divides large queries into smaller, manageable time-based chunks (typically 30-minute segments). This methodology:
  • Reduces the likelihood of network timeouts and data corruption
  • Automatically retries failed chunks without losing progress
  • Continues downloading even when individual chunks fail
  • Provides clear feedback on download progress

Complete Python Implementation

import liberator
import pandas as pd
from datetime import datetime, timedelta
from typing import Union, List, Optional
import time


def getdata(dataset: str,
           start_date: Union[str, datetime],
           end_date: Union[str, datetime],
           symbols: Union[str, List[str]],
           chunk_minutes: int = 30,
           max_retries: int = 3,
           retry_delay: float = 1.0,
           verbose: bool = True) -> pd.DataFrame:
    """
    Download data by breaking the time range into smaller chunks.

    Each chunk is fetched independently with retry logic, so a transient
    network failure costs at most one chunk of progress rather than the
    whole download.

    Args:
        dataset: Dataset name (e.g., 'spiderrock_printsets_indexed_single_tenant')
        start_date: Start date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        end_date: End date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        symbols: Single symbol string or list of symbols
        chunk_minutes: Minutes per chunk (default 30); must be positive
        max_retries: Maximum retries per chunk (default 3)
        retry_delay: Delay between retries in seconds (default 1.0)
        verbose: Print progress messages (default True)

    Returns:
        pandas.DataFrame: Combined data for the entire time range, or an
        empty DataFrame if no chunk returned data.
    """

    # Parse and validate inputs
    start_dt = _parse_datetime(start_date)
    end_dt = _parse_datetime(end_date)

    # Normalize a single symbol into a one-element list.
    if isinstance(symbols, str):
        symbols = [symbols]

    if verbose:
        print(f"Downloading {dataset} from {start_dt} to {end_dt} for {len(symbols)} symbols")
        print(f"Using {chunk_minutes}-minute chunks")

    # Generate time chunks
    chunks = _generate_time_chunks(start_dt, end_dt, chunk_minutes)

    if verbose:
        print(f"Processing {len(chunks)} time chunks...")

    # Download each chunk
    all_dfs = []        # chunks that returned rows
    failed_chunks = []  # chunks that raised after all retries
    empty_chunks = []   # chunks with zero rows (legitimate, not an error)

    for i, (chunk_start, chunk_end) in enumerate(chunks):
        if verbose:
            # Show seconds only when the boundary isn't on a whole minute
            # (e.g. the final clamped chunk of a day ending at 23:59:59).
            end_display = chunk_end.strftime('%H:%M:%S') if chunk_end.second != 0 else chunk_end.strftime('%H:%M')
            print(f"  Processing chunk {i+1}/{len(chunks)} ({chunk_start.strftime('%H:%M')} - {end_display})", end="")

        # Download chunk with retries
        result = _download_chunk_with_retry(
            dataset=dataset,
            start_time=chunk_start,
            end_time=chunk_end,
            symbols=symbols,
            max_retries=max_retries,
            retry_delay=retry_delay,
            verbose=verbose
        )

        if result is None:
            # None means every retry attempt raised an exception.
            failed_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - FAILED")
        elif len(result) == 0:
            # Empty result (no data for this time period)
            empty_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - EMPTY")
        else:
            # Success with data
            all_dfs.append(result)
            if verbose:
                print(f" - SUCCESS ({len(result):,} rows)")

    # Report results
    if verbose:
        total_rows = sum(len(df) for df in all_dfs)
        print(f"Complete: {len(all_dfs)} chunks with data, {len(empty_chunks)} empty chunks, {len(failed_chunks)} failed chunks")
        print(f"Total rows downloaded: {total_rows:,}")

        if failed_chunks:
            print(f"Warning: {len(failed_chunks)} chunks failed due to errors")

    if not all_dfs:
        # BUGFIX: this message previously printed unconditionally,
        # ignoring verbose=False; honor the verbose flag here too.
        if verbose:
            print("No data downloaded successfully")
        return pd.DataFrame()

    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Restore global time ordering across chunk boundaries when the
    # dataset exposes a recognized time column.
    if 'muts' in combined_df.columns and '_seq' in combined_df.columns:
        combined_df = combined_df.sort_values(['muts', '_seq'])
        combined_df.reset_index(drop=True, inplace=True)
    elif 'timestamp' in combined_df.columns:
        combined_df = combined_df.sort_values('timestamp')
        combined_df.reset_index(drop=True, inplace=True)

    return combined_df


def _parse_datetime(dt_input: Union[str, datetime]) -> datetime:
    """Parse string or datetime input into datetime object."""
    if isinstance(dt_input, datetime):
        return dt_input

    dt_str = str(dt_input).strip()

    # Try different datetime formats
    formats = [
        '%Y-%m-%d %H:%M:%S.%f',  # Full datetime with microseconds
        '%Y-%m-%d %H:%M:%S',     # Full datetime
        '%Y-%m-%d %H:%M',        # Date with hour:minute
        '%Y-%m-%d',              # Date only
    ]

    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue

    raise ValueError(f"Unable to parse datetime: {dt_input}")


def _generate_time_chunks(start_dt: datetime, end_dt: datetime, chunk_minutes: int) -> List[tuple]:
    """Generate list of (start, end) datetime tuples for chunks."""
    chunks = []
    current_start = start_dt
    chunk_delta = timedelta(minutes=chunk_minutes)

    while current_start < end_dt:
        current_end = min(current_start + chunk_delta, end_dt)
        chunks.append((current_start, current_end))
        current_start = current_end

    return chunks


def _download_chunk_with_retry(dataset: str,
                              start_time: datetime,
                              end_time: datetime,
                              symbols: List[str],
                              max_retries: int,
                              retry_delay: float,
                              verbose: bool = True) -> Optional[pd.DataFrame]:
    """Fetch one time chunk from liberator, retrying on exceptions.

    Returns:
        The chunk's DataFrame — possibly empty, which is a valid
        "no data in this window" result — or None when every attempt
        raised an exception (or max_retries is not positive).
    """
    # liberator expects wall-clock timestamp strings, not datetimes.
    start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')
    end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')

    attempt = 0
    while attempt < max_retries:
        try:
            reply = liberator.query(
                name=dataset,
                symbols=symbols,
                back_to=start_str,
                as_of=end_str
            )
            # An empty frame is returned as-is; only exceptions count
            # as failures.
            return liberator.get_dataframe(reply)
        except Exception as err:
            attempt += 1
            if attempt < max_retries:
                # Back off briefly before the next attempt.
                time.sleep(retry_delay)
            elif verbose:
                print(f"    Failed chunk {start_time.strftime('%H:%M')}-{end_time.strftime('%H:%M')} after {max_retries} attempts: {str(err)}")

    # Reached only when every attempt raised (or max_retries <= 0).
    return None


# Example usage functions
def download_single_day(dataset: str, date: str, symbols: Union[str, List[str]], **kwargs) -> pd.DataFrame:
    """
    Convenience function to download a single day's data.

    Args:
        dataset: Dataset name
        date: Date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Additional arguments passed to getdata()

    Returns:
        pandas.DataFrame: Day's data
    """
    start_date = f"{date} 00:00:00"
    # CONSISTENCY FIX: end at 23:59:59.999 (was 23:59:59) so sub-second
    # records in the final second are included, matching the bound used
    # by download_date_range().
    end_date = f"{date} 23:59:59.999"

    return getdata(
        dataset=dataset,
        start_date=start_date,
        end_date=end_date,
        symbols=symbols,
        **kwargs
    )


def download_date_range(dataset: str,
                       start_date: str,
                       end_date: str,
                       symbols: Union[str, List[str]],
                       **kwargs) -> pd.DataFrame:
    """
    Download data spanning multiple days.

    The range runs from midnight on start_date through 23:59:59.999 on
    end_date, inclusive of both days.

    Args:
        dataset: Dataset name
        start_date: Start date as 'YYYY-MM-DD'
        end_date: End date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Additional arguments passed to getdata()

    Returns:
        pandas.DataFrame: Multi-day data
    """
    # Expand the bare dates to full-day datetime bounds before delegating.
    range_start = _parse_datetime(f"{start_date} 00:00:00")
    range_end = _parse_datetime(f"{end_date} 23:59:59.999")

    return getdata(
        dataset=dataset,
        start_date=range_start,
        end_date=range_end,
        symbols=symbols,
        **kwargs
    )

Usage Examples

Single Day, Multiple Symbols

df1 = download_single_day(
    dataset='spiderrock_printsets_indexed_single_tenant',
    date='2025-09-08',
    symbols=['AAPL', 'GOOGL', 'MSFT']
)

Custom Time Range

df2 = getdata(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-08 09:30:00',
    end_date='2025-09-08 16:00:00',
    symbols='AAPL',
    chunk_minutes=15,
    verbose=False
)

Multi-Day Download

df3 = download_date_range(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-03',
    end_date='2025-09-06',
    symbols=['AAPL', 'GOOGL'],
    chunk_minutes=120  # 2-hour chunks for longer periods
)

Sample Output

Downloading spiderrock_printsets_indexed_single_tenant from 2025-09-03 00:00:00
to 2025-09-06 23:59:59.999000 for 2 symbols
Using 120-minute chunks
Processing 48 time chunks...
  Processing chunk 1/48 (00:00 - 02:00) - EMPTY
  Processing chunk 2/48 (02:00 - 04:00) - EMPTY
  Processing chunk 3/48 (04:00 - 06:00) - EMPTY
  Processing chunk 4/48 (06:00 - 08:00) - EMPTY
  Processing chunk 5/48 (08:00 - 10:00) - SUCCESS (143,762 rows)
  Processing chunk 6/48 (10:00 - 12:00) - SUCCESS (168,790 rows)
  ...
Complete: 13 chunks with data, 35 empty chunks, 0 failed chunks
Total rows downloaded: 959,576
For multi-day downloads, increase chunk_minutes to reduce the number of API calls. A value of 120 (2 hours) works well for longer date ranges.
The chunking approach is designed for network resilience. If you are on a stable connection and downloading moderately sized datasets, a single query may be simpler and faster.