from flask import Flask, render_template, request, jsonify, send_file
from werkzeug.utils import secure_filename
import pandas as pd
import os
import uuid
import re
from datetime import datetime

# Try to import PyPDF2, if not available show error
try:
    import PyPDF2
    PDF_SUPPORT = True
except ImportError:
    PDF_SUPPORT = False
    print("⚠️ PyPDF2 not installed. Run: pip install PyPDF2")

app = Flask(__name__)

# Configuration
UPLOAD_FOLDER = 'uploads'
EXCEL_FOLDER = 'excel_files'

os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(EXCEL_FOLDER, exist_ok=True)

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50MB

def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() == 'pdf'

def extract_text_from_pdf(pdf_path):
    """Extract all text content from PDF file"""
    if not PDF_SUPPORT:
        return "PyPDF2 not installed. Please run: pip install PyPDF2"
    
    text_content = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            for page_num in range(num_pages):
                page = pdf_reader.pages[page_num]
                page_text = page.extract_text()
                if page_text:
                    text_content += page_text + "\n\n"
    except Exception as e:
        text_content = f"Error extracting text: {str(e)}"
    return text_content if text_content else "No text could be extracted from this PDF."

def extract_data_from_text(text):
    """Extract useful information from PDF text content"""
    data = {
        'Extracted Text Preview': text[:1000] + "..." if len(text) > 1000 else text,
        'Full Text Length': len(text),
        'Word Count': len(text.split()),
        'Line Count': len(text.split('\n')),
        'Has Numbers': 'Yes' if re.search(r'\d', text) else 'No',
        'Has Emails': 'Yes' if re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text) else 'No',
        'Has URLs': 'Yes' if re.search(r'https?://\S+', text) else 'No'
    }
    
    # Extract email addresses
    emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    data['Emails Found'] = ', '.join(list(set(emails))[:10]) if emails else 'None'
    data['Email Count'] = len(set(emails))
    
    # Extract phone numbers
    phone_patterns = [
        r'\+\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',
        r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',
        r'\(\d{3}\)\s?\d{3}[-.\s]?\d{4}',
        r'\d{4}[-.\s]?\d{7}',
    ]
    phones = []
    for pattern in phone_patterns:
        phones.extend(re.findall(pattern, text))
    data['Phone Numbers Found'] = ', '.join(list(set(phones))[:10]) if phones else 'None'
    data['Phone Count'] = len(set(phones))
    
    # Extract numbers/amounts
    amounts = re.findall(r'\$\d+(?:,\d+)*(?:\.\d+)?|\d+(?:\.\d+)?\s?(?:USD|EUR|INR|PKR|Rupees|Rs\.?)', text, re.IGNORECASE)
    data['Amounts Found'] = ', '.join(amounts[:10]) if amounts else 'None'
    
    return data

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/convert-pdf-to-excel', methods=['POST'])
def convert_pdf_to_excel():
    """Convert uploaded PDF to Excel file with extracted data"""
    try:
        # Check if file is uploaded
        if 'pdf_file' not in request.files:
            return jsonify({
                'success': False,
                'message': 'No file uploaded! Please select a PDF file.'
            }), 400
        
        uploaded_file = request.files['pdf_file']
        
        if uploaded_file.filename == '':
            return jsonify({
                'success': False,
                'message': 'No file selected! Please choose a PDF file.'
            }), 400
        
        if not allowed_file(uploaded_file.filename):
            return jsonify({
                'success': False,
                'message': 'Only PDF files are allowed! Please upload a .pdf file.'
            }), 400
        
        # Save PDF file
        original_filename = secure_filename(uploaded_file.filename)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        unique_id = str(uuid.uuid4())[:6]
        pdf_filename = f"{timestamp}_{unique_id}_{original_filename}"
        pdf_path = os.path.join(app.config['UPLOAD_FOLDER'], pdf_filename)
        uploaded_file.save(pdf_path)
        
        # Extract text from PDF
        extracted_text = extract_text_from_pdf(pdf_path)
        
        # Extract data from text
        extracted_data = extract_data_from_text(extracted_text)
        
        # Create Excel file with multiple sheets
        excel_filename = f"PDF_Data_{timestamp}_{unique_id}.xlsx"
        excel_path = os.path.join(EXCEL_FOLDER, excel_filename)
        
        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            # Sheet 1: File Information
            df_file = pd.DataFrame([
                {
                    'Original Filename': original_filename,
                    'Saved Filename': pdf_filename,
                    'File Size (KB)': round(os.path.getsize(pdf_path) / 1024, 2),
                    'Conversion Date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'Conversion ID': unique_id.upper()
                }
            ])
            df_file.to_excel(writer, sheet_name='File Information', index=False)
            
            # Sheet 2: Statistics
            df_stats = pd.DataFrame([
                {
                    'Total Characters': extracted_data['Full Text Length'],
                    'Total Words': extracted_data['Word Count'],
                    'Total Lines': extracted_data['Line Count'],
                    'Contains Numbers': extracted_data['Has Numbers'],
                    'Contains Emails': extracted_data['Has Emails'],
                    'Contains URLs': extracted_data['Has URLs'],
                    'Email Count': extracted_data['Email Count'],
                    'Phone Count': extracted_data['Phone Count']
                }
            ])
            df_stats.to_excel(writer, sheet_name='Statistics', index=False)
            
            # Sheet 3: Extracted Information
            df_extracted = pd.DataFrame([
                {
                    'Email Addresses Found': extracted_data['Emails Found'],
                    'Phone Numbers Found': extracted_data['Phone Numbers Found'],
                    'Amounts/Values Found': extracted_data['Amounts Found']
                }
            ])
            df_extracted.to_excel(writer, sheet_name='Extracted Information', index=False)
            
            # Sheet 4: Full Text (split into chunks)
            text_chunks = [extracted_text[i:i+30000] for i in range(0, len(extracted_text), 30000)]
            df_text = pd.DataFrame({
                'Chunk Number': range(1, len(text_chunks) + 1),
                'Text Content': text_chunks
            }) if text_chunks else pd.DataFrame({'Text Content': ['No text extracted']})
            df_text.to_excel(writer, sheet_name='Full Extracted Text', index=False)
            
            # Sheet 5: Preview
            df_preview = pd.DataFrame([
                {'Preview (First 1000 chars)': extracted_data['Extracted Text Preview']}
            ])
            df_preview.to_excel(writer, sheet_name='Text Preview', index=False)
        
        # Return the Excel file
        return send_file(
            excel_path,
            as_attachment=True,
            download_name=f"PDF_to_Excel_{original_filename.replace('.pdf', '')}_{timestamp}.xlsx",
            mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )
        
    except Exception as e:
        print(f"Error: {str(e)}")
        return jsonify({
            'success': False,
            'message': f'Conversion failed: {str(e)}'
        }), 500

@app.route('/preview-pdf', methods=['POST'])
def preview_pdf():
    """Get preview data from PDF without downloading Excel"""
    try:
        if 'pdf_file' not in request.files:
            return jsonify({'success': False, 'message': 'No file uploaded'}), 400
        
        uploaded_file = request.files['pdf_file']
        
        if uploaded_file.filename == '' or not allowed_file(uploaded_file.filename):
            return jsonify({'success': False, 'message': 'Invalid PDF file'}), 400
        
        original_filename = secure_filename(uploaded_file.filename)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        unique_id = str(uuid.uuid4())[:6]
        pdf_filename = f"{timestamp}_{unique_id}_{original_filename}"
        pdf_path = os.path.join(app.config['UPLOAD_FOLDER'], pdf_filename)
        uploaded_file.save(pdf_path)
        
        extracted_text = extract_text_from_pdf(pdf_path)
        extracted_data = extract_data_from_text(extracted_text)
        
        # Clean up temp file after preview
        try:
            os.remove(pdf_path)
        except:
            pass
        
        return jsonify({
            'success': True,
            'message': 'PDF analyzed successfully!',
            'filename': original_filename,
            'stats': {
                'characters': extracted_data['Full Text Length'],
                'words': extracted_data['Word Count'],
                'lines': extracted_data['Line Count'],
                'emails': extracted_data['Email Count'],
                'phones': extracted_data['Phone Count']
            },
            'found_data': {
                'emails': extracted_data['Emails Found'],
                'phones': extracted_data['Phone Numbers Found'],
                'amounts': extracted_data['Amounts Found']
            },
            'preview': extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text
        })
        
    except Exception as e:
        print(f"Preview error: {str(e)}")
        return jsonify({'success': False, 'message': str(e)}), 500

if __name__ == '__main__':
    print("\n" + "="*60)
    print("📄 PDF TO EXCEL CONVERTER")
    print("📍 User uploads PDF -> Click Convert -> Get Excel file")
    print("🌐 Open in browser: http://127.0.0.1:5000")
    print("="*60 + "\n")
    app.run(debug=True, host='127.0.0.1', port=5000)