#!/usr/bin/env python3
"""
PDF Processing Script using Python
Converts PDF to Markdown format (best for RAG)
"""

import sys
import json
import os
from pathlib import Path

try:
    import pdfplumber
except ImportError as e:
    print(json.dumps({
        "error": f"Missing required Python package: pdfplumber - {str(e)}",
        "success": False
    }), file=sys.stderr)
    sys.exit(1)

try:
    import fitz  # PyMuPDF imports as 'fitz'
except ImportError:
    try:
        import PyMuPDF as fitz  # Try alternative import
    except ImportError as e:
        print(json.dumps({
            "error": f"Missing required Python package: PyMuPDF - {str(e)}",
            "success": False
        }), file=sys.stderr)
        sys.exit(1)

# camelot is optional - requires OpenGL libraries which may not be available on all servers
# If unavailable, we'll use pdfplumber for table extraction
camelot_available = False
try:
    import camelot
    camelot_available = True
except (ImportError, OSError) as e:
    # OSError can occur if OpenGL libraries (libGL.so.1) are missing
    print(f"Warning: camelot-py not available ({str(e)}), will use pdfplumber for table extraction", file=sys.stderr)
    camelot_available = False


def extract_text_with_pymupdf(pdf_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of ``{"page": <1-based page number>, "text": <str>}`` dicts,
        one for each page that yielded non-blank text. Blank lines are
        dropped and every remaining line is stripped. Failures are reported
        on stderr and whatever was collected so far is returned (an empty
        list when the document could not be opened at all).
    """
    text_content = []
    try:
        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text("text")
                if not text:
                    continue
                # Drop blank lines and surrounding whitespace but keep the
                # remaining line breaks.
                lines = [line.strip() for line in text.split('\n') if line.strip()]
                if lines:
                    text_content.append({
                        "page": page_num,
                        "text": '\n'.join(lines)
                    })
        finally:
            # Release the document even when a page fails to parse;
            # previously an exception in the loop skipped close().
            doc.close()
    except Exception as e:
        print(f"Error extracting text with PyMuPDF: {e}", file=sys.stderr)
    return text_content

def _words_to_lines(words, y_tolerance=2):
    """Group pdfplumber word dicts into text lines in reading order.

    Args:
        words: Iterable of dicts carrying ``text``, ``top`` and ``x0`` keys
            (the shape used by the word-extraction call in the caller).
        y_tolerance: Maximum difference in ``top`` between two consecutive
            words (after vertical sorting) for them to share a line.

    Returns:
        A list of strings, one per line, ordered top-to-bottom with the words
        of each line ordered left-to-right.
    """
    positioned = []
    for word in words:
        word_text = (word.get('text') or '').strip()
        if word_text:
            positioned.append((word.get('top', 0), word.get('x0', 0), word_text))

    # ``top`` is the distance from the TOP of the page, so ascending order is
    # top-to-bottom reading order.
    positioned.sort(key=lambda item: item[0])

    rows = []
    current_row = []
    last_top = None
    for top, x0, word_text in positioned:
        # A significant vertical jump from the previous word starts a new line.
        if last_top is not None and abs(top - last_top) > y_tolerance:
            rows.append(current_row)
            current_row = []
        current_row.append((x0, word_text))
        last_top = top
    if current_row:
        rows.append(current_row)

    # Order words left-to-right only after the line is known, so small
    # baseline differences inside one line cannot scramble the word order.
    return [
        ' '.join(text for _, text in sorted(row, key=lambda item: item[0]))
        for row in rows
    ]


def extract_text_with_pdfplumber(pdf_path):
    """Extract per-page text from a PDF using pdfplumber.

    Each page is first rebuilt from its individual words (grouped into lines
    by vertical position). If that yields fewer than 10 characters, the
    page's standard ``extract_text()`` result is used instead.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of ``{"page": <1-based page number>, "text": <str>}`` dicts,
        one for each page that yielded non-blank text. Failures are reported
        on stderr; pages collected before a failure are still returned.
    """
    text_content = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = None

                # Method 1: rebuild the text from positioned words.
                try:
                    words = page.extract_words()
                    if words:
                        text = '\n'.join(_words_to_lines(words))
                except Exception as e:
                    print(f"Error extracting words: {e}", file=sys.stderr)

                # Method 2: fall back to the standard extraction when the
                # word-based result is missing or suspiciously short.
                if not text or len(text.strip()) < 10:
                    try:
                        text = page.extract_text()
                    except Exception as e:
                        # Best effort: keep whatever method 1 produced, but
                        # report the failure instead of hiding it.
                        print(f"Error extracting page text: {e}", file=sys.stderr)

                if text and text.strip():
                    # Drop blank lines and surrounding whitespace but keep
                    # the remaining line breaks.
                    lines = [line.strip() for line in text.split('\n') if line.strip()]
                    text_content.append({
                        "page": page_num,
                        "text": '\n'.join(lines)
                    })
    except Exception as e:
        print(f"Error extracting text with pdfplumber: {e}", file=sys.stderr)
    return text_content


def extract_tables_with_pdfplumber(pdf_path):
    """Collect every non-empty table pdfplumber finds in the PDF.

    Used as the table extractor when camelot is unavailable or fails.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of ``{"page", "data", "accuracy"}`` dicts, where ``page`` is
        the 1-based page number, ``data`` is the table as returned by
        pdfplumber and ``accuracy`` is the fixed value 70. Errors are
        reported on stderr and the tables gathered so far are returned.
    """
    found = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                found.extend([
                    {
                        "page": page_number,
                        "data": rows,
                        "accuracy": 70,  # fixed score assigned to pdfplumber tables
                    }
                    for rows in page.extract_tables()
                    if rows
                ])
    except Exception as e:
        print(f"Error extracting tables with pdfplumber: {e}", file=sys.stderr)
    return found

def extract_tables_with_camelot(pdf_path):
    """Extract tables with camelot, delegating to pdfplumber when needed.

    pdfplumber is used instead when camelot could not be imported, or when
    the camelot run raises. Only camelot tables whose reported accuracy is
    above 80 are kept.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``{"page", "data", "accuracy"}`` dicts; ``data`` is the
        table as a list of row lists.
    """
    # No camelot on this machine: go straight to the pdfplumber extractor.
    if not camelot_available:
        return extract_tables_with_pdfplumber(pdf_path)

    accepted = []
    try:
        # Lattice mode over every page of the document.
        candidates = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')
        for candidate in candidates:
            accuracy = candidate.parsing_report['accuracy']
            if not accuracy > 80:
                # Low-confidence detections are discarded.
                continue
            accepted.append({
                "page": candidate.page,
                "data": candidate.df.values.tolist(),  # DataFrame -> list of lists
                "accuracy": accuracy,
            })
    except Exception as e:
        print(f"Error extracting tables with camelot: {e}, falling back to pdfplumber", file=sys.stderr)
        # Any camelot failure discards its partial result in favour of pdfplumber.
        return extract_tables_with_pdfplumber(pdf_path)

    return accepted


def table_to_markdown(table_data):
    """Render a table (a sequence of rows) as a Markdown pipe table.

    The first row that is actually written is treated as the header and is
    followed by a ``---`` separator row, so a leading empty or malformed row
    can no longer produce a table without a separator.

    Args:
        table_data: Sequence of rows, each a list or tuple of cell values.
            ``None`` cells become empty cells; other values are converted
            with ``str()``. Entries that are not lists/tuples, and empty
            rows, are skipped.

    Returns:
        The Markdown table as a string without a trailing newline, or ``""``
        when there is nothing to render.
    """
    if not table_data:
        return ""

    def _escape(cell):
        # Pipes would end the cell and line breaks would end the row, so
        # neutralise both; None stands for an empty cell.
        if cell is None:
            return ''
        return str(cell).strip().replace('|', '\\|').replace('\n', ' ').replace('\r', '')

    markdown_lines = []
    for row in table_data:
        if not isinstance(row, (list, tuple)) or not row:
            continue

        cells = [_escape(cell) for cell in row]
        markdown_lines.append('| ' + ' | '.join(cells) + ' |')

        # The separator goes after the first EMITTED row, which is not
        # necessarily table_data[0].
        if len(markdown_lines) == 1:
            markdown_lines.append('| ' + ' | '.join(['---'] * len(cells)) + ' |')

    return '\n'.join(markdown_lines)


def convert_pdf_to_markdown(pdf_path):
    """Convert a PDF file to a Markdown document, one section per page.

    Text is extracted with PyMuPDF, falling back to pdfplumber when PyMuPDF
    returns nothing. Tables are extracted separately and appended after the
    text of their page. Images are not extracted.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        On success, a dict with ``success`` (True), ``pdf_path``,
        ``markdown`` (the generated text) and ``summary`` (``total_pages`` =
        number of pages that produced text or tables, ``total_tables``,
        ``total_images`` = 0). On failure, a dict with ``success`` (False),
        ``error`` (message) and ``markdown`` (None).
    """
    # isfile (not exists): a directory path must be rejected here instead of
    # flowing through the extractors and being reported as a successful,
    # empty conversion.
    if not os.path.isfile(pdf_path):
        return {
            "error": f"PDF file not found: {pdf_path}",
            "success": False,
            "markdown": None
        }

    try:
        # Extract text - PyMuPDF first, pdfplumber only if nothing came back.
        text_content = extract_text_with_pymupdf(pdf_path)
        if not text_content:
            text_content = extract_text_with_pdfplumber(pdf_path)

        # Extract tables
        tables = extract_tables_with_camelot(pdf_path)

        # Group text and tables by page number.
        page_content = {}
        for text_item in text_content:
            entry = page_content.setdefault(text_item["page"], {"text": [], "tables": []})
            entry["text"].append(text_item["text"])
        for table in tables:
            entry = page_content.setdefault(table["page"], {"text": [], "tables": []})
            entry["tables"].append(table["data"])

        # Build Markdown content in page order.
        markdown_content = []
        for page_num in sorted(page_content):
            content = page_content[page_num]

            # Page header
            markdown_content.append(f"\n# Page {page_num}\n")

            # Text: paragraphs are separated by blank lines; whitespace inside
            # a paragraph (including single line breaks) collapses to spaces.
            combined_text = '\n\n'.join(t for t in content["text"] if t and t.strip())
            for para in combined_text.split('\n\n'):
                cleaned_para = ' '.join(para.split())
                if cleaned_para:
                    markdown_content.append(f"{cleaned_para}\n\n")

            # Tables
            for table_data in content["tables"]:
                if not table_data:
                    continue
                markdown_content.append("\n")
                markdown_table = table_to_markdown(table_data)
                if markdown_table:
                    markdown_content.append(markdown_table)
                    markdown_content.append("\n")

            # Images are skipped - not included in Markdown output

        return {
            "success": True,
            "pdf_path": pdf_path,
            "markdown": '\n'.join(markdown_content),
            "summary": {
                "total_pages": len(page_content),
                "total_tables": len(tables),
                "total_images": 0  # Images not included
            }
        }

    except Exception as e:
        return {
            "error": f"Error processing PDF: {str(e)}",
            "success": False,
            "markdown": None
        }


if __name__ == "__main__":
    # Command-line entry point:
    #   process_pdf.py <pdf_file_path> [output_markdown_path]
    # A JSON report is printed on stdout; the exit status is 0 on success
    # and 1 otherwise.
    cli_args = sys.argv[1:]
    if not cli_args:
        usage_error = {
            "error": "Usage: python process_pdf.py <pdf_file_path> [output_markdown_path]",
            "success": False
        }
        print(json.dumps(usage_error), file=sys.stderr)
        sys.exit(1)

    pdf_path = cli_args[0]
    output_path = cli_args[1] if len(cli_args) > 1 else None

    result = convert_pdf_to_markdown(pdf_path)

    # Write the Markdown file only when a target was given and the
    # conversion succeeded with non-empty Markdown.
    if output_path and result.get("success") and result.get("markdown"):
        try:
            with open(output_path, 'w', encoding='utf-8') as fh:
                fh.write(result["markdown"])
        except Exception as exc:
            result["error"] = f"Failed to write Markdown file: {str(exc)}"
            result["success"] = False
        else:
            result["markdown_file"] = output_path

    # The printed report is a shallow copy, so the Markdown body can be
    # replaced by a short pointer once it has been written to disk.
    output_result = dict(result)
    if output_path and "markdown_file" in output_result:
        output_result["markdown"] = f"[Markdown content written to: {output_path}]"

    print(json.dumps(output_result, indent=2))

    exit_status = 0 if result.get("success") else 1
    sys.exit(exit_status)

