
Downloading Very Large Datasets

For large time-series datasets, especially over potentially unstable network connections, we recommend chunking large queries into smaller time segments.

The attached code breaks large queries into smaller time-based chunks (typically 30-minute segments), which:

  • Reduces the likelihood of network timeouts and data corruption
  • Automatically retries failed chunks without losing progress
  • Continues downloading even if individual chunks fail
  • Provides clear feedback on download progress

 

import liberator
import pandas as pd
from datetime import datetime, timedelta
from typing import Union, List, Optional
import time

def getdata(dataset: str, 
           start_date: Union[str, datetime], 
           end_date: Union[str, datetime], 
           symbols: Union[str, List[str]], 
           chunk_minutes: int = 30,
           max_retries: int = 3,
           retry_delay: float = 1.0,
           verbose: bool = True) -> pd.DataFrame:
    """
    Download data by breaking the time range into smaller chunks.
    
    Args:
        dataset: Dataset name (e.g., 'spiderrock_printsets_indexed_single_tenant')
        start_date: Start date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        end_date: End date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        symbols: Single symbol string or list of symbols
        chunk_minutes: Minutes per chunk (default 30)
        max_retries: Maximum retries per chunk (default 3)
        retry_delay: Delay between retries in seconds (default 1.0)
        verbose: Print progress messages (default True)
    
    Returns:
        pandas.DataFrame: Combined data for the entire time range
    """
    
    # Parse and validate inputs
    start_dt = _parse_datetime(start_date)
    end_dt = _parse_datetime(end_date)
    
    if isinstance(symbols, str):
        symbols = [symbols]
    
    if verbose:
        print(f"Downloading {dataset} from {start_dt} to {end_dt} for {len(symbols)} symbols")
        print(f"Using {chunk_minutes}-minute chunks")
    
    # Generate time chunks
    chunks = _generate_time_chunks(start_dt, end_dt, chunk_minutes)
    
    if verbose:
        print(f"Processing {len(chunks)} time chunks...")
    
    # Download each chunk
    all_dfs = []
    failed_chunks = []
    empty_chunks = []
    
    for i, (chunk_start, chunk_end) in enumerate(chunks):
        if verbose:  # Progress on every chunk
            # Show actual end time, not just hour:minute if it's not on a clean boundary
            end_display = chunk_end.strftime('%H:%M:%S') if chunk_end.second != 0 else chunk_end.strftime('%H:%M')
            print(f"  Processing chunk {i+1}/{len(chunks)} ({chunk_start.strftime('%H:%M')} - {end_display})", end="")
        
        # Download chunk with retries
        result = _download_chunk_with_retry(
            dataset=dataset,
            start_time=chunk_start,
            end_time=chunk_end,
            symbols=symbols,
            max_retries=max_retries,
            retry_delay=retry_delay,
            verbose=verbose
        )
        
        if result is None:
            # Actual failure (exception occurred)
            failed_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - FAILED")
        elif len(result) == 0:
            # Empty result (no data for this time period)
            empty_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - EMPTY")
        else:
            # Success with data
            all_dfs.append(result)
            if verbose:
                print(f" - SUCCESS ({len(result):,} rows)")
    
    # Report results
    if verbose:
        total_rows = sum(len(df) for df in all_dfs)
        print(f"Complete: {len(all_dfs)} chunks with data, {len(empty_chunks)} empty chunks, {len(failed_chunks)} failed chunks")
        print(f"Total rows downloaded: {total_rows:,}")
        
        if len(failed_chunks) > 0:
            print(f"Warning: {len(failed_chunks)} chunks failed due to errors")
    
    # Combine all dataframes
    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)
        
        # Sort by time if columns exist
        if 'muts' in combined_df.columns and '_seq' in combined_df.columns:
            combined_df = combined_df.sort_values(['muts', '_seq'])
            combined_df.reset_index(drop=True, inplace=True)
        elif 'timestamp' in combined_df.columns:
            combined_df = combined_df.sort_values('timestamp')
            combined_df.reset_index(drop=True, inplace=True)
        
        return combined_df
    else:
        if verbose:
            print("No data downloaded successfully")
        return pd.DataFrame()


def _parse_datetime(dt_input: Union[str, datetime]) -> datetime:
    """Parse string or datetime input into datetime object."""
    if isinstance(dt_input, datetime):
        return dt_input
    
    dt_str = str(dt_input).strip()
    
    # Try different datetime formats
    formats = [
        '%Y-%m-%d %H:%M:%S.%f',  # Full datetime with microseconds
        '%Y-%m-%d %H:%M:%S',     # Full datetime
        '%Y-%m-%d %H:%M',        # Date with hour:minute
        '%Y-%m-%d',              # Date only
    ]
    
    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
    
    raise ValueError(f"Unable to parse datetime: {dt_input}")
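
# Illustrative results for _parse_datetime (for reference; based on the formats listed above):
#   _parse_datetime('2025-09-08')          -> datetime(2025, 9, 8, 0, 0)
#   _parse_datetime('2025-09-08 09:30')    -> datetime(2025, 9, 8, 9, 30)
#   _parse_datetime('2025-09-08 09:30:00') -> datetime(2025, 9, 8, 9, 30)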


def _generate_time_chunks(start_dt: datetime, end_dt: datetime, chunk_minutes: int) -> List[tuple]:
    """Generate list of (start, end) datetime tuples for chunks."""
    chunks = []
    current_start = start_dt
    chunk_delta = timedelta(minutes=chunk_minutes)
    
    while current_start < end_dt:
        current_end = min(current_start + chunk_delta, end_dt)
        chunks.append((current_start, current_end))
        current_start = current_end
    
    return chunks
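
# Illustrative chunking (30-minute chunks over a 75-minute window; times shown as HH:MM):
#   _generate_time_chunks(datetime(2025, 9, 8, 9, 30), datetime(2025, 9, 8, 10, 45), 30)
#   -> [(09:30, 10:00), (10:00, 10:30), (10:30, 10:45)]
# Note that the final chunk is truncated to the requested end time rather than overrunning it.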


def _download_chunk_with_retry(dataset: str, 
                              start_time: datetime, 
                              end_time: datetime, 
                              symbols: List[str], 
                              max_retries: int, 
                              retry_delay: float,
                              verbose: bool = True) -> Optional[pd.DataFrame]:
    """Download a single time chunk with retry logic."""
    
    # Format times as strings for liberator
    start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')
    end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')
    
    for attempt in range(max_retries):
        try:
            # Query the data
            query_result = liberator.query(
                name=dataset,
                symbols=symbols,
                back_to=start_str,
                as_of=end_str
            )
            
            # Convert to dataframe
            df = liberator.get_dataframe(query_result)
            
            # Return the dataframe (could be empty, but that's not a failure)
            return df
            
        except Exception as e:
            if attempt < max_retries - 1:  # Don't sleep on last attempt
                time.sleep(retry_delay)
            else:
                if verbose:
                    print(f"    Failed chunk {start_time.strftime('%H:%M')}-{end_time.strftime('%H:%M')} after {max_retries} attempts: {str(e)}")
                return None  # Return None only on actual exception
    
    return None


# Example usage functions
def download_single_day(dataset: str, date: str, symbols: Union[str, List[str]], **kwargs) -> pd.DataFrame:
    """
    Convenience function to download a single day's data.
    
    Args:
        dataset: Dataset name
        date: Date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Additional arguments passed to getdata()
    
    Returns:
        pandas.DataFrame: Day's data
    """
    start_date = f"{date} 00:00:00"
    end_date = f"{date} 23:59:59"
    
    return getdata(
        dataset=dataset,
        start_date=start_date,
        end_date=end_date,
        symbols=symbols,
        **kwargs
    )


def download_date_range(dataset: str, 
                       start_date: str, 
                       end_date: str, 
                       symbols: Union[str, List[str]], 
                       **kwargs) -> pd.DataFrame:
    """
    Download data across multiple days.
    
    Args:
        dataset: Dataset name  
        start_date: Start date as 'YYYY-MM-DD'
        end_date: End date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Additional arguments passed to getdata()
    
    Returns:
        pandas.DataFrame: Multi-day data
    """
    start_dt = _parse_datetime(f"{start_date} 00:00:00")
    end_dt = _parse_datetime(f"{end_date} 23:59:59.999")  # Explicit milliseconds
    
    return getdata(
        dataset=dataset,
        start_date=start_dt,
        end_date=end_dt,
        symbols=symbols,
        **kwargs
    )

Here are some example calls...

# Example 1: Single day, multiple symbols
df1 = download_single_day(
    dataset='spiderrock_printsets_indexed_single_tenant',
    date='2025-09-08',
    symbols=['AAPL', 'GOOGL', 'MSFT']
)
# Example 2: Custom time range with verbose off
df2 = getdata(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-08 09:30:00',
    end_date='2025-09-08 16:00:00',
    symbols='AAPL',
    chunk_minutes=15,
    verbose=False  # Silent operation
)
# Example 3: Multi-day download
df3 = download_date_range(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-03',
    end_date='2025-09-06',
    symbols=['AAPL', 'GOOGL'],
    chunk_minutes=120  # 2-hour chunks for longer periods
)

Sample Output from Example 3

Downloading spiderrock_printsets_indexed_single_tenant from 2025-09-03 00:00:00 to 2025-09-06 23:59:59.999000 for 2 symbols
Using 120-minute chunks
Processing 48 time chunks...
Processing chunk 1/48 (00:00 - 02:00) - EMPTY
Processing chunk 2/48 (02:00 - 04:00) - EMPTY
Processing chunk 3/48 (04:00 - 06:00) - EMPTY
Processing chunk 4/48 (06:00 - 08:00) - EMPTY
Processing chunk 5/48 (08:00 - 10:00) - SUCCESS (143,762 rows)
Processing chunk 6/48 (10:00 - 12:00) - SUCCESS (168,790 rows)
Processing chunk 7/48 (12:00 - 14:00) - SUCCESS (72,840 rows)
Processing chunk 8/48 (14:00 - 16:00) - SUCCESS (69,498 rows)
Processing chunk 9/48 (16:00 - 18:00) - EMPTY
Processing chunk 10/48 (18:00 - 20:00) - EMPTY
Processing chunk 11/48 (20:00 - 22:00) - EMPTY
Processing chunk 12/48 (22:00 - 00:00) - EMPTY
Processing chunk 13/48 (00:00 - 02:00) - EMPTY
Processing chunk 14/48 (02:00 - 04:00) - EMPTY
Processing chunk 15/48 (04:00 - 06:00) - EMPTY
Processing chunk 16/48 (06:00 - 08:00) - EMPTY
Processing chunk 17/48 (08:00 - 10:00) - SUCCESS (59,208 rows)
Processing chunk 18/48 (10:00 - 12:00) - SUCCESS (81,511 rows)
Processing chunk 19/48 (12:00 - 14:00) - SUCCESS (41,247 rows)
Processing chunk 20/48 (14:00 - 16:00) - SUCCESS (69,774 rows)
Processing chunk 21/48 (16:00 - 18:00) - SUCCESS (2 rows)
Processing chunk 22/48 (18:00 - 20:00) - EMPTY
Processing chunk 23/48 (20:00 - 22:00) - EMPTY
Processing chunk 24/48 (22:00 - 00:00) - EMPTY
Processing chunk 25/48 (00:00 - 02:00) - EMPTY
Processing chunk 26/48 (02:00 - 04:00) - EMPTY
Processing chunk 27/48 (04:00 - 06:00) - EMPTY
Processing chunk 28/48 (06:00 - 08:00) - EMPTY
Processing chunk 29/48 (08:00 - 10:00) - SUCCESS (61,492 rows)
Processing chunk 30/48 (10:00 - 12:00) - SUCCESS (96,447 rows)
Processing chunk 31/48 (12:00 - 14:00) - SUCCESS (43,307 rows)
Processing chunk 32/48 (14:00 - 16:00) - SUCCESS (51,698 rows)
Processing chunk 33/48 (16:00 - 18:00) - EMPTY
Processing chunk 34/48 (18:00 - 20:00) - EMPTY
Processing chunk 35/48 (20:00 - 22:00) - EMPTY
Processing chunk 36/48 (22:00 - 00:00) - EMPTY
Processing chunk 37/48 (00:00 - 02:00) - EMPTY
Processing chunk 38/48 (02:00 - 04:00) - EMPTY
Processing chunk 39/48 (04:00 - 06:00) - EMPTY
Processing chunk 40/48 (06:00 - 08:00) - EMPTY
Processing chunk 41/48 (08:00 - 10:00) - EMPTY
Processing chunk 42/48 (10:00 - 12:00) - EMPTY
Processing chunk 43/48 (12:00 - 14:00) - EMPTY
Processing chunk 44/48 (14:00 - 16:00) - EMPTY
Processing chunk 45/48 (16:00 - 18:00) - EMPTY
Processing chunk 46/48 (18:00 - 20:00) - EMPTY
Processing chunk 47/48 (20:00 - 22:00) - EMPTY
Processing chunk 48/48 (22:00 - 23:59:59) - EMPTY
Complete: 13 chunks with data, 35 empty chunks, 0 failed chunks
Total rows downloaded: 959,576
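
For multi-day pulls like Example 3, it is often worth persisting the combined DataFrame so the download does not have to be repeated. Below is a minimal sketch using standard pandas I/O; the filename is only an example, and DataFrame.to_parquet requires a Parquet engine such as pyarrow to be installed.

# Save the combined result from Example 3 so it can be reloaded without re-querying
if not df3.empty:
    df3.to_parquet('spiderrock_prints_2025-09-03_2025-09-06.parquet', index=False)

# In a later session:
# df3 = pd.read_parquet('spiderrock_prints_2025-09-03_2025-09-06.parquet')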