> ## Documentation Index
> Fetch the complete documentation index at: https://knowledge.cloudquant.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Downloading Very Large Datasets

> Chunk large time series queries into smaller segments to handle network instability and avoid timeouts when downloading market data.

# Downloading very large datasets

If you experience network instability while downloading large time series datasets, split the query into smaller time segments rather than attempting a single monolithic download.

## Core strategy

The recommended approach divides large queries into smaller, manageable time-based chunks (typically 30-minute segments). This methodology:

* Reduces the likelihood of network timeouts and data corruption
* Automatically retries failed chunks without losing progress
* Continues downloading even when individual chunks fail
* Provides clear feedback on download progress

## Complete Python implementation

```python theme={null}
import liberator
import pandas as pd
from datetime import datetime, timedelta
from typing import Union, List, Optional
import time


def getdata(dataset: str,
           start_date: Union[str, datetime],
           end_date: Union[str, datetime],
           symbols: Union[str, List[str]],
           chunk_minutes: int = 30,
           max_retries: int = 3,
           retry_delay: float = 1.0,
           verbose: bool = True) -> pd.DataFrame:
    """
    Download data by breaking the time range into smaller chunks.

    Chunks are fetched one after another, each with its own retry budget.
    A chunk that still fails is skipped so the rest of the range can be
    downloaded. Failed chunks are counted in the verbose summary; when
    verbose is False a RuntimeWarning is issued instead, because the
    returned frame is then incomplete.

    Args:
        dataset: Dataset name (e.g., 'spiderrock_printsets_indexed_single_tenant')
        start_date: Start date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        end_date: End date/time as string 'YYYY-MM-DD [HH:MM:SS]' or datetime
        symbols: Single symbol string or list of symbols
        chunk_minutes: Minutes per chunk (default 30); must be positive
        max_retries: Maximum retries per chunk (default 3)
        retry_delay: Delay between retries in seconds (default 1.0)
        verbose: Print progress messages (default True)

    Returns:
        pandas.DataFrame: Combined rows of every chunk that returned data,
        sorted by 'muts' and '_seq' when both columns exist, otherwise by
        'timestamp' when that column exists. An empty DataFrame is returned
        when no chunk produced rows.

    Raises:
        ValueError: If chunk_minutes is not positive, or a date string does
            not match one of the supported formats.
    """
    # Validate before doing any work: a zero or negative chunk size cannot
    # step forward through the requested time range.
    if chunk_minutes <= 0:
        raise ValueError(f"chunk_minutes must be positive, got {chunk_minutes}")

    # Parse and validate inputs
    start_dt = _parse_datetime(start_date)
    end_dt = _parse_datetime(end_date)

    if isinstance(symbols, str):
        symbols = [symbols]

    if verbose:
        print(f"Downloading {dataset} from {start_dt} to {end_dt} for {len(symbols)} symbols")
        print(f"Using {chunk_minutes}-minute chunks")

    # Generate time chunks
    chunks = _generate_time_chunks(start_dt, end_dt, chunk_minutes)

    if verbose:
        print(f"Processing {len(chunks)} time chunks...")

    # Download each chunk, sorting the outcome into one of three buckets
    all_dfs = []
    failed_chunks = []
    empty_chunks = []

    for i, (chunk_start, chunk_end) in enumerate(chunks):
        if verbose:
            # Show seconds only when the chunk does not end on a whole minute
            end_display = chunk_end.strftime('%H:%M:%S') if chunk_end.second != 0 else chunk_end.strftime('%H:%M')
            # end="" keeps the line open so the outcome is appended to it below
            print(f"  Processing chunk {i+1}/{len(chunks)} ({chunk_start.strftime('%H:%M')} - {end_display})", end="")

        # Download chunk with retries
        result = _download_chunk_with_retry(
            dataset=dataset,
            start_time=chunk_start,
            end_time=chunk_end,
            symbols=symbols,
            max_retries=max_retries,
            retry_delay=retry_delay,
            verbose=verbose
        )

        if result is None:
            # Actual failure (exception occurred)
            failed_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - FAILED")
        elif len(result) == 0:
            # Empty result (no data for this time period)
            empty_chunks.append((chunk_start, chunk_end))
            if verbose:
                print(" - EMPTY")
        else:
            # Success with data
            all_dfs.append(result)
            if verbose:
                print(f" - SUCCESS ({len(result):,} rows)")

    # Report results
    if verbose:
        total_rows = sum(len(df) for df in all_dfs)
        print(f"Complete: {len(all_dfs)} chunks with data, {len(empty_chunks)} empty chunks, {len(failed_chunks)} failed chunks")
        print(f"Total rows downloaded: {total_rows:,}")

        if failed_chunks:
            print(f"Warning: {len(failed_chunks)} chunks failed due to errors")
    elif failed_chunks:
        # With progress output disabled nothing else tells the caller that
        # part of the requested range is missing from the result.
        import warnings
        warnings.warn(
            f"{len(failed_chunks)} of {len(chunks)} chunks failed; "
            "the returned data is incomplete",
            RuntimeWarning,
            stacklevel=2,
        )

    if not all_dfs:
        # Respect verbose=False: this message used to be printed unconditionally
        if verbose:
            print("No data downloaded successfully")
        return pd.DataFrame()

    # Combine all dataframes
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Sort by time if columns exist
    columns = combined_df.columns
    if 'muts' in columns and '_seq' in columns:
        sort_keys = ['muts', '_seq']
    elif 'timestamp' in columns:
        sort_keys = ['timestamp']
    else:
        sort_keys = []

    if sort_keys:
        combined_df = combined_df.sort_values(sort_keys).reset_index(drop=True)

    return combined_df


def _parse_datetime(dt_input: Union[str, datetime]) -> datetime:
    """Parse string or datetime input into datetime object."""
    if isinstance(dt_input, datetime):
        return dt_input

    dt_str = str(dt_input).strip()

    # Try different datetime formats
    formats = [
        '%Y-%m-%d %H:%M:%S.%f',  # Full datetime with microseconds
        '%Y-%m-%d %H:%M:%S',     # Full datetime
        '%Y-%m-%d %H:%M',        # Date with hour:minute
        '%Y-%m-%d',              # Date only
    ]

    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue

    raise ValueError(f"Unable to parse datetime: {dt_input}")


def _generate_time_chunks(start_dt: datetime, end_dt: datetime, chunk_minutes: int) -> List[tuple]:
    """Generate list of (start, end) datetime tuples for chunks."""
    chunks = []
    current_start = start_dt
    chunk_delta = timedelta(minutes=chunk_minutes)

    while current_start < end_dt:
        current_end = min(current_start + chunk_delta, end_dt)
        chunks.append((current_start, current_end))
        current_start = current_end

    return chunks


def _download_chunk_with_retry(dataset: str,
                              start_time: datetime,
                              end_time: datetime,
                              symbols: List[str],
                              max_retries: int,
                              retry_delay: float,
                              verbose: bool = True) -> Optional[pd.DataFrame]:
    """Download a single time chunk with retry logic."""

    # Format times as strings for liberator
    start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')
    end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')

    for attempt in range(max_retries):
        try:
            # Query the data
            query_result = liberator.query(
                name=dataset,
                symbols=symbols,
                back_to=start_str,
                as_of=end_str
            )

            # Convert to dataframe
            df = liberator.get_dataframe(query_result)

            # Return the dataframe (could be empty, but that's not a failure)
            return df

        except Exception as e:
            if attempt < max_retries - 1:  # Don't sleep on last attempt
                time.sleep(retry_delay)
            else:
                if verbose:
                    print(f"    Failed chunk {start_time.strftime('%H:%M')}-{end_time.strftime('%H:%M')} after {max_retries} attempts: {str(e)}")
                return None  # Return None only on actual exception

    return None


# Example usage functions
def download_single_day(dataset: str, date: str, symbols: Union[str, List[str]], **kwargs) -> pd.DataFrame:
    """
    Fetch one calendar day of data through getdata().

    The requested window runs from 00:00:00 to 23:59:59 on the given date.

    Args:
        dataset: Dataset name
        date: Date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Extra keyword arguments forwarded unchanged to getdata()

    Returns:
        pandas.DataFrame: Whatever getdata() returns for that day
    """
    # Build the day's bounds as strings; getdata() does the parsing
    day_open, day_close = f"{date} 00:00:00", f"{date} 23:59:59"

    return getdata(dataset=dataset, start_date=day_open, end_date=day_close, symbols=symbols, **kwargs)


def download_date_range(dataset: str,
                       start_date: str,
                       end_date: str,
                       symbols: Union[str, List[str]],
                       **kwargs) -> pd.DataFrame:
    """
    Fetch data spanning several calendar days through getdata().

    The window opens at 00:00:00 on start_date and closes at 23:59:59.999
    on end_date, so both days are included.

    Args:
        dataset: Dataset name
        start_date: Start date as 'YYYY-MM-DD'
        end_date: End date as 'YYYY-MM-DD'
        symbols: Symbol(s) to download
        **kwargs: Extra keyword arguments forwarded unchanged to getdata()

    Returns:
        pandas.DataFrame: Whatever getdata() returns for the whole range
    """
    # Parse up front so a malformed date fails before any download starts
    range_open = _parse_datetime(f"{start_date} 00:00:00")
    range_close = _parse_datetime(f"{end_date} 23:59:59.999")

    return getdata(dataset=dataset, start_date=range_open, end_date=range_close, symbols=symbols, **kwargs)
```

## Usage examples

### Single day, multiple symbols

```python theme={null}
# Requests 2025-09-08 from 00:00:00 to 23:59:59 for three symbols, using
# getdata()'s defaults (30-minute chunks, progress messages printed).
df1 = download_single_day(
    dataset='spiderrock_printsets_indexed_single_tenant',
    date='2025-09-08',
    symbols=['AAPL', 'GOOGL', 'MSFT']
)
```

### Custom time range

```python theme={null}
# Calls getdata() directly for a 09:30-16:00 window on one symbol, split
# into 15-minute chunks; a single symbol may be passed as a plain string.
df2 = getdata(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-08 09:30:00',
    end_date='2025-09-08 16:00:00',
    symbols='AAPL',
    chunk_minutes=15,
    verbose=False  # turn off progress messages
)
```

### Multi-day download

```python theme={null}
# Covers 2025-09-03 00:00:00 through 2025-09-06 23:59:59.999 (both days
# included); chunk_minutes is forwarded to getdata() via **kwargs.
df3 = download_date_range(
    dataset='spiderrock_printsets_indexed_single_tenant',
    start_date='2025-09-03',
    end_date='2025-09-06',
    symbols=['AAPL', 'GOOGL'],
    chunk_minutes=120  # 2-hour chunks for longer periods
)
```

## Sample output

```
Downloading spiderrock_printsets_indexed_single_tenant from 2025-09-03 00:00:00
to 2025-09-06 23:59:59.999000 for 2 symbols
Using 120-minute chunks
Processing 48 time chunks...
  Processing chunk 1/48 (00:00 - 02:00) - EMPTY
  Processing chunk 2/48 (02:00 - 04:00) - EMPTY
  Processing chunk 3/48 (04:00 - 06:00) - EMPTY
  Processing chunk 4/48 (06:00 - 08:00) - EMPTY
  Processing chunk 5/48 (08:00 - 10:00) - SUCCESS (143,762 rows)
  Processing chunk 6/48 (10:00 - 12:00) - SUCCESS (168,790 rows)
  ...
Complete: 13 chunks with data, 35 empty chunks, 0 failed chunks
Total rows downloaded: 959,576
```

<Tip>
  For multi-day downloads, increase `chunk_minutes` to reduce the number of API calls. A value of 120 (2 hours) works well for longer date ranges.
</Tip>

<Warning>
  The chunking approach is designed for network resilience. If you are on a stable connection and downloading moderately sized datasets, a single query may be simpler and faster.
</Warning>
