```python
from aurelio_sdk import AurelioClient
import os

# Set your API key as an environment variable:
# export AURELIO_API_KEY=your_api_key_here

# Initialize the client
client = AurelioClient(api_key=os.environ["AURELIO_API_KEY"])

# Or use the async client for better performance
from aurelio_sdk import AsyncAurelioClient

async_client = AsyncAurelioClient(api_key=os.environ["AURELIO_API_KEY"])
```
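If you use the async client, the same calls are awaited inside a coroutine. A minimal sketch, assuming `AsyncAurelioClient` mirrors the sync client's `extract_file` signature as an awaitable:

```python
import asyncio

async def main():
    # Assumes async_client (created above) exposes the same extract_file
    # signature as the sync client, but as a coroutine. Awaiting it frees
    # the event loop while the document is processed server-side.
    response = await async_client.extract_file(
        file_path="document.pdf",
        model="docling-base",
        chunk=True,
        wait=30
    )
    print(response.status)

asyncio.run(main())
```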
```python
from aurelio_sdk import ExtractResponse

# Local PDF file
response = client.extract_file(
    file_path="document.pdf",
    model="docling-base",  # Higher-accuracy model (replaces quality="high")
    chunk=True,            # Automatically chunk the document
    wait=30                # Wait up to 30 seconds for processing
)

# Access the document ID for status checking
document_id = response.document.id

# If the document is still processing, wait for completion
final_response = response
if response.status != "complete":
    final_response = client.wait_for(document_id=document_id, wait=300)

# Access the chunks once processing is complete
for chunk in final_response.chunks:
    print(f"Chunk: {chunk.text[:100]}...")
```
For PDF URLs:
```python
url_response = client.extract_url(
    url="https://arxiv.org/pdf/2305.10403.pdf",
    model="docling-base",  # More accurate model for complex PDFs
    chunk=True,
    wait=30
)
```
For video files (only the aurelio-base model is supported):
```python
video_response = client.extract_file(
    file_path="video.mp4",
    model="aurelio-base",  # Only supported model for video
    chunk=True,
    wait=-1,  # Wait indefinitely until processing completes
    processing_options={
        "chunking": {
            "chunker_type": "semantic"  # Better chunking for video content
        }
    }
)
```
```python
from aurelio_sdk import ChunkingOptions, ChunkResponse

# Define chunking parameters
chunking_options = ChunkingOptions(
    chunker_type="semantic",  # Uses semantic chunking
    max_chunk_length=400,     # Maximum token limit for one chunk
    window_size=5             # Rolling window context size
)

long_text = """Your long document text here..."""

# Perform chunking
chunk_response = client.chunk(
    content=long_text,
    processing_options=chunking_options
)

# Process the chunks
for i, chunk in enumerate(chunk_response.chunks):
    print(f"Chunk {i+1}: {chunk.text[:50]}...")
```
```python
from aurelio_sdk import EmbeddingResponse

# Generate embeddings for a single text
single_embedding = client.embedding(
    input="This is a sample text to embed",
    model="bm25"  # Choose your embedding model
)

# Generate embeddings for multiple texts (batch processing)
texts = [
    "First document to embed",
    "Second document to embed",
    "Third document to embed"
]
batch_embeddings = client.embedding(
    input=texts,
    model="bm25"  # Same model for the whole batch
)

# Access the embedding vectors
vectors = batch_embeddings.data
```
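Each entry in `data` corresponds positionally to one input text. A quick sanity check; note that the exact shape of each item may differ by SDK version and model (sparse models such as bm25 may return index/value pairs rather than a dense vector):

```python
# One result per input, in the same order as the input list
assert len(vectors) == len(texts)

# Inspect the first item to confirm its shape for your SDK version
print(type(vectors[0]))
```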
```python
# 1. Extract and chunk a PDF
extract_response = client.extract_file(
    file_path="research_paper.pdf",
    model="docling-base",
    chunk=True,
    wait=60
)

# Wait for completion if needed
if extract_response.status != "complete":
    extract_response = client.wait_for(
        document_id=extract_response.document.id, wait=300
    )

# 2. Get all chunk texts
chunk_texts = [chunk.text for chunk in extract_response.chunks]

# 3. Generate embeddings for all chunks
embedding_response = client.embedding(input=chunk_texts, model="bm25")

# 4. Now you have vectorized your PDF document
# Each vector corresponds to a chunk from the original document
vectors = embedding_response.data
```
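From here, the chunk texts and vectors can be zipped into records for a vector database. A minimal sketch; the record layout, the ID scheme, and the `.embedding` attribute on each item in `embedding_response.data` are assumptions, so adapt the field names to your SDK version and your store's API:

```python
# Pair each chunk with its embedding. The `.embedding` attribute is an
# assumption -- check your SDK version for the exact field name.
records = [
    {
        "id": f"{extract_response.document.id}-{i}",  # hypothetical ID scheme
        "values": item.embedding,
        "metadata": {"text": text},
    }
    for i, (text, item) in enumerate(zip(chunk_texts, embedding_response.data))
]
```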