# FileGraph API

> Document conversion and processing API. Extract text from any document, convert between formats, and manipulate PDFs.

Base URL: https://api.filegraph.ai

## Authentication

All endpoints require an API key in the `X-API-Key` header:

```
X-API-Key: your-api-key
```

Get your API key at https://filegraph.ai/dashboard

## Quick Start

```python
import httpx

with open("document.pdf", "rb") as f:
    httpx.post(
        "https://api.filegraph.ai/any/to-text",
        headers={"X-API-Key": "YOUR_API_KEY"},
        params={"ocr_language": "eng", "ocr_fallback": "true"},
        files=[("file", f)],
    )
```

```python
# --- Production Usage ---
from __future__ import annotations

import random
import time
from pathlib import Path
from types import TracebackType
from typing import Any

import httpx
from pydantic import BaseModel, Field, ConfigDict, field_validator


class ExtractTextResponse(BaseModel):
    """Response model for the /any/to-text endpoint.

    Validated in strict mode, so field values are not coerced from other
    types.
    """

    text: str = Field(description="Extracted text content")
    format_detected: str = Field(description="Detected input format (e.g., 'pdf', 'docx', 'image')")
    word_count: int = Field(description="Number of words extracted")
    char_count: int = Field(description="Number of characters extracted")
    used_ocr: bool = Field(description="Whether OCR was used for extraction")
    metadata: dict[str, Any] | None = Field(default=None, description="Additional format-specific metadata")

    model_config = ConfigDict(strict=True)


class ExtractTextRequest(BaseModel):
    """Request model for the /any/to-text endpoint.

    Parameters:
    - ocr_language: Language code for OCR processing (default 'eng')
    - ocr_fallback: Whether to fall back to OCR if text extraction fails (default True)
    - file: Path to the document file to extract text from (must exist)
    """

    ocr_language: str = Field(default="eng", description="Language code for OCR processing")
    ocr_fallback: bool = Field(default=True, description="Whether to fallback to OCR if text extraction fails")
    file: Path = Field(description="Path to the document file to extract text from")

    model_config = ConfigDict(strict=True)

    @field_validator("file")
    @classmethod
    def check_file_exists(cls, v: Path) -> Path:
        """Reject paths that do not point at an existing regular file."""
        if not v.is_file():
            raise ValueError(f"File does not exist: {v}")
        return v


class FileGraphClient:
    """Client for interacting with the FileGraph API.

    The underlying ``httpx.Client`` is created on ``__enter__`` and closed on
    ``__exit__``, so the client must be used as a context manager.

    Usage:
        with FileGraphClient(api_key) as client:
            response = client.extract_text(request_model)
    """

    BASE_URL = "https://api.filegraph.ai"
    TIMEOUT_DEFAULT = 30.0
    # Not applied automatically; pass it as ``timeout=`` for big uploads.
    TIMEOUT_LARGE_FILE = 120.0

    # Upper bound, in seconds, for any single wait between retries.
    _MAX_RETRY_WAIT = 60

    def __init__(
        self,
        api_key: str,
        timeout: float | None = None,
    ) -> None:
        """Store credentials and the default timeout; no connection is opened.

        Args:
            api_key: Value sent in the ``X-API-Key`` header.
            timeout: Default request timeout in seconds. ``None`` selects
                ``TIMEOUT_DEFAULT``.
        """
        self._api_key = api_key
        # Compare against None so an explicit 0 is not replaced by the default.
        self._timeout = self.TIMEOUT_DEFAULT if timeout is None else timeout
        self._client: httpx.Client | None = None

    def __enter__(self) -> FileGraphClient:
        limits = httpx.Limits(max_connections=10, max_keepalive_connections=5)
        self._client = httpx.Client(
            base_url=self.BASE_URL,
            headers={"X-API-Key": self._api_key},
            timeout=self._timeout,
            limits=limits,
        )
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        if self._client is not None:
            self._client.close()
            self._client = None

    def _require_client(self) -> httpx.Client:
        """Return the live HTTP client or raise if used outside ``with``."""
        # An explicit raise (not ``assert``) so the guard survives ``python -O``.
        if self._client is None:
            raise RuntimeError("Client is not initialized. Use 'with' statement.")
        return self._client

    def _retry_delay(self, response: httpx.Response, attempt: int) -> float:
        """Return how long to wait before retrying a 429 response.

        A purely numeric ``Retry-After`` header is honoured; otherwise
        exponential backoff with jitter is used. Either way the wait is
        capped at ``_MAX_RETRY_WAIT`` seconds.
        """
        retry_after = response.headers.get("Retry-After")
        if retry_after and retry_after.isdigit():
            delay: float = int(retry_after)
        else:
            delay = (2 ** attempt) + random.uniform(0, 1)
        return min(delay, self._MAX_RETRY_WAIT)

    def _retry_request(
        self,
        method: str,
        url: str,
        max_retries: int = 3,
        **kwargs: Any,
    ) -> httpx.Response:
        """Execute a request, retrying automatically on 429 rate-limit errors.

        Args:
            method: HTTP method.
            url: URL or path relative to ``BASE_URL``.
            max_retries: Number of retries after the first attempt (>= 0).
            **kwargs: Forwarded unchanged to ``httpx.Client.request``.

        Returns:
            The first response whose status is not 429.

        Raises:
            RuntimeError: If the client is used outside a ``with`` block.
            ValueError: If ``max_retries`` is negative.
            httpx.HTTPStatusError: If the final attempt is still a 429.
        """
        client = self._require_client()
        if max_retries < 0:
            raise ValueError("max_retries must be >= 0")

        for attempt in range(max_retries + 1):
            response = client.request(method, url, **kwargs)
            if response.status_code != 429:
                return response
            if attempt == max_retries:
                break  # Give up after max retries
            time.sleep(self._retry_delay(response, attempt))

        # Only reached when every attempt was rate limited.
        response.raise_for_status()
        return response

    @staticmethod
    def _trace_id(response: httpx.Response) -> Any:
        """Best-effort lookup of ``trace_id`` in a JSON error body."""
        try:
            payload = response.json()
        except ValueError:  # body is not valid JSON
            return "unknown"
        if isinstance(payload, dict):
            return payload.get("trace_id", "unknown")
        return "unknown"

    def extract_text(
        self,
        request: ExtractTextRequest,
        *,
        timeout: float | None = None,
    ) -> ExtractTextResponse:
        """Extract text from a document file using the /any/to-text endpoint.

        Args:
            request: ExtractTextRequest instance with parameters and file path.
            timeout: Optional per-call timeout in seconds, e.g.
                ``FileGraphClient.TIMEOUT_LARGE_FILE``. ``None`` uses the
                timeout the client was constructed with.

        Returns:
            ExtractTextResponse instance with extracted text and metadata.

        Raises:
            RuntimeError: If the client is used outside a ``with`` block.
            httpx.HTTPStatusError: On non-success HTTP responses.
        """
        self._require_client()
        params = {
            "ocr_language": request.ocr_language,
            "ocr_fallback": str(request.ocr_fallback).lower(),
        }
        effective_timeout = self._timeout if timeout is None else timeout

        # ``with`` closes the upload handle on every exit path.
        with request.file.open("rb") as handle:
            files = {
                "file": (request.file.name, handle, "application/octet-stream"),
            }
            try:
                response = self._retry_request(
                    method="POST",
                    url="/any/to-text",
                    params=params,
                    files=files,
                    timeout=effective_timeout,
                )
                response.raise_for_status()
            except httpx.HTTPStatusError as exc:
                trace_id = self._trace_id(exc.response)
                raise httpx.HTTPStatusError(
                    f"HTTP error {exc.response.status_code} for /any/to-text (trace_id={trace_id}): {exc.response.text}",
                    request=exc.request,
                    response=exc.response,
                ) from exc

        return ExtractTextResponse.model_validate(response.json())


# Example:
# with FileGraphClient("your-api-key") as fg:
#     result = fg.extract_text(ExtractTextRequest(file=Path("invoice.pdf")))
#     print(result.text)
```

## Text Extraction

Extract text from any document format. The smart /any/to-text endpoint auto-detects format and uses OCR for scanned documents.

### POST /any/to-text

Auto-detects file format (PDF, DOCX, XLSX, PPTX, images, HTML) and extracts text.
Uses OCR automatically for scanned documents and images.

**Parameters:**

- `file` (file): The document file to extract text from
- `ocr_language` (string (optional), default: eng): Language for OCR (ISO 639-3 code)
- `ocr_fallback` (boolean (optional), default: true): Use OCR if no text found

**Response:**

```json
{
  "text": "Extracted text content...",
  "format_detected": "pdf",
  "word_count": 150,
  "char_count": 892,
  "used_ocr": false,
  "metadata": {"page_count": 3}
}
```

### POST /any/info

Returns file metadata without processing content. Includes format, MIME type, page count, dimensions, etc.

**Parameters:**

- `file` (file): The file to analyze

**Response:**

```json
{
  "filename": "document.pdf",
  "mime_type": "application/pdf",
  "format_detected": "pdf",
  "size_bytes": 102400,
  "metadata": {"page_count": 5}
}
```

## PDF Operations

Manipulate PDF files: merge, split, compress, rotate, and manage password protection.

### POST /pdf/to-text

Extract text from PDF with optional OCR fallback for scanned pages.

**Parameters:**

- `file` (file): PDF file to extract text from
- `ocr_fallback` (boolean (optional), default: true): Use OCR if no text found
- `ocr_language` (string (optional), default: eng): Language for OCR

### POST /pdf/merge

Combine multiple PDF files into a single document. Files are merged in the order provided.

**Parameters:**

- `files` (file[]): PDF files to merge (in order)

**Response:**

```json
{
  "content_base64": "JVBERi0xLjQK...",
  "filename": "merged.pdf",
  "total_pages": 10,
  "input_count": 3
}
```

### POST /pdf/split

Split a PDF into multiple files based on page ranges. Examples: '1-5', '1,3,5', '1-5,10-15', '10-' (page 10 to end).

**Parameters:**

- `file` (file): PDF file to split
- `pages` (string): Page ranges (e.g., '1-5,10-15')

### POST /pdf/compress

Reduce PDF file size. Higher compression may reduce quality.
**Parameters:**

- `file` (file): PDF file to compress
- `level` (string (optional), default: medium): Compression level: low, medium, high

**Response:**

```json
{
  "content_base64": "JVBERi0xLjQK...",
  "filename": "compressed.pdf",
  "input_size_bytes": 1024000,
  "output_size_bytes": 512000,
  "compression_ratio": 0.5
}
```

### POST /pdf/rotate

Rotate pages in a PDF by 90, 180, or 270 degrees.

**Parameters:**

- `file` (file): PDF file to rotate
- `angle` (integer (optional), default: 90): Rotation angle: 90, 180, or 270
- `pages` (string (optional)): Page range to rotate (default: all)

### POST /pdf/encrypt

Add password protection to a PDF. User password is required to open; owner password grants full permissions.

**Parameters:**

- `file` (file): PDF file to encrypt
- `user_password` (string): Password to open the PDF
- `owner_password` (string (optional)): Password for full permissions (defaults to user_password)

### POST /pdf/decrypt

Remove password protection from a PDF. Requires the correct password.

**Parameters:**

- `file` (file): Password-protected PDF
- `password` (string): Password to unlock the PDF

## Image & OCR

Extract text from images using OCR. Supports PNG, JPEG, TIFF, BMP, GIF, WebP.

### POST /image/to-text

Use OCR to extract text from images. Supports multiple languages.

**Parameters:**

- `file` (file): Image file (PNG, JPEG, TIFF, BMP, GIF, WebP)
- `language` (string (optional), default: eng): OCR language code (e.g., eng, fra, deu, spa, chi_sim, jpn)

**Response:**

```json
{
  "text": "Text extracted from image...",
  "word_count": 50,
  "char_count": 280,
  "used_ocr": true
}
```

## Office Documents

Extract text from Microsoft Office documents: Word (DOCX), Excel (XLSX), PowerPoint (PPTX).

### POST /docx/to-text

Extract text from DOCX files including headers, footers, and tables.

**Parameters:**

- `file` (file): Word document (.docx)

### POST /xlsx/to-text

Extract data from Excel files. Can select specific sheets.
**Parameters:**

- `file` (file): Excel spreadsheet (.xlsx)
- `sheets` (string (optional)): Sheet names or indices to extract (default: all)

### POST /pptx/to-text

Extract text from all slides including speaker notes.

**Parameters:**

- `file` (file): PowerPoint presentation (.pptx)

## HTML Processing

Extract text from HTML content, removing tags and scripts.

### POST /html/to-text

Extract visible text content from HTML, removing all tags and scripts.

**Parameters:**

- `file` (file): HTML file

## Response Format

All text extraction endpoints return JSON with this structure:

```json
{
  "text": "Extracted text content...",
  "format_detected": "pdf",
  "word_count": 150,
  "char_count": 892,
  "used_ocr": false,
  "metadata": {}
}
```

## Error Handling

Errors return standard HTTP status codes:

- `400` - Bad request (unsupported format, invalid parameters)
- `401` - Invalid or missing API key
- `413` - File too large
- `422` - Processing failed
- `500` - Internal server error