import os
from internetarchive import search_items, download

def _build_search_query(collection, start_year=None, end_year=None, topic=None):
    """
    Build the archive.org search query selecting PDF items in a collection.

    Args:
        collection (str): The collection identifier on archive.org
        start_year (str/int): Lower bound of the year range (optional)
        end_year (str/int): Upper bound of the year range (optional)
        topic (str): Subject/topic keyword filter (optional)

    Returns:
        str: The individual query clauses joined with ' AND '.
    """
    clauses = [f'collection:{collection}', 'format:PDF']

    if topic:
        # Backslash-escape backslashes and double quotes so a topic that
        # contains them cannot close the quoted phrase early and corrupt
        # the rest of the query.
        safe_topic = str(topic).replace('\\', '\\\\').replace('"', '\\"')
        # 'subject' is a common metadata field for topics
        clauses.append(f'subject:"{safe_topic}"')

    if start_year or end_year:
        # A missing bound falls back to the widest four-digit year so the
        # range stays open on that side.
        lower = start_year or '0000'
        upper = end_year or '9999'
        clauses.append(f'year:[{lower} TO {upper}]')

    return ' AND '.join(clauses)


def download_pdfs_from_collection(
    collection,
    output_dir='downloads',
    start_year=None,
    end_year=None,
    topic=None,
    max_items=None
):
    """
    Download PDFs from a specified archive.org collection.

    The search query, each item identifier, and any per-item failure are
    printed as the run progresses. A failure while downloading one item is
    reported and skipped so the remaining items are still attempted; errors
    raised while searching still propagate to the caller.

    Args:
        collection (str): The collection identifier on archive.org (e.g., 'opensource')
        output_dir (str): Directory to save downloaded PDFs (created if missing)
        start_year (str/int): Start year for filtering items (e.g., '2000')
        end_year (str/int): End year for filtering items (e.g., '2020')
        topic (str): Subject/topic keyword filter
        max_items (int): Max number of items to attempt (optional); None or 0
            means no limit

    Returns:
        int: Number of items whose download call completed without raising.
    """
    query = _build_search_query(collection, start_year, end_year, topic)

    print("Searching Archive.org with query:", query)

    os.makedirs(output_dir, exist_ok=True)

    downloaded = 0
    failed = []
    for attempted, result in enumerate(search_items(query)):
        # The limit counts attempted items, whether or not they succeeded.
        if max_items and attempted >= max_items:
            break
        identifier = result['identifier']
        print(f'Downloading PDFs for item: {identifier}')
        try:
            # Only download PDF files
            download(identifier, destdir=output_dir, glob_pattern='*.pdf', verbose=True)
        except Exception as exc:
            # One bad item (network error, missing file, ...) must not abort
            # a long batch; record it and move on to the next item.
            print(f'Failed to download item {identifier}: {exc}')
            failed.append(identifier)
            continue
        downloaded += 1

    if failed:
        print(f'{len(failed)} item(s) failed to download:', ', '.join(failed))

    return downloaded

if __name__ == '__main__':
    # Example run: adjust the settings below, then execute this file directly.
    # Each key is passed to download_pdfs_from_collection as a keyword argument.
    example_settings = {
        # Target collection identifier
        'collection': 'maharashtragr',
        # Directory that receives the downloads
        'output_dir': './archive_downloads',
        # First year of the range (or None)
        'start_year': '2025',
        # Last year of the range (or None)
        'end_year': '2025',
        # Topic/subject filter (or None)
        'topic': 'Water Resources Department',
        # Cap on the number of items, kept small for demo purposes
        'max_items': 500,
    }

    download_pdfs_from_collection(**example_settings)
