try:
    import pandas as pd
except ImportError:
    print("❌ pandas library not found!")
    print("📦 Installing pandas...")
    import subprocess
    import sys
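    # openpyxl handles .xlsx workbooks; pandas uses the xlrd engine for legacy .xls files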
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas", "openpyxl", "xlrd"])
    import pandas as pd
    print("✅ pandas installed successfully!")

import os

def clean_dataframe(df, file_name):
    """
    Remove the row containing 'Total records in this page' and every row below it.
    """
    print(f"🧹 Cleaning {file_name} - Looking for 'Total records in this page'...")

    # Look for the row whose first column contains "Total records in this page"
    total_records_row = None

    for index, row in df.iterrows():
        # Check the first column (index 0)
        if pd.notna(row.iloc[0]) and "Total records in this page" in str(row.iloc[0]):
            total_records_row = index
            print(f"   ⚠️  Found 'Total records in this page' at row {index}")
            break

    if total_records_row is not None:
        # Drop the matched row and every row below it
        original_length = len(df)
        df_cleaned = df.iloc[:total_records_row].copy()
        removed_rows = original_length - len(df_cleaned)
        print(f"   ✅ Removed {removed_rows} rows (from row {total_records_row} onwards)")
        return df_cleaned
    else:
        print(f"   ✅ No 'Total records in this page' found - no cleaning needed")
        return df

def remove_unwanted_columns(df):
    """
    Remove unwanted columns from the merged data.
    """
    print("🗑️  Removing unwanted columns...")

    # Columns to remove
    columns_to_remove = [
        "Offers Count",
        "Offer Accepted Count",
        "Offer Declined Count",
        "Hired Count",
        "Joined Count",
        "Associated Jobs",
        "Requester",
        "Approvers",
        "Requisition Approved Date",
        "Requisition Active Date",
        "Requisition Rejected Date",
        "Requisition Reject Reason",
        "Designation / Tittle",
        "Priority",
        "Location",
        "Cost Center",
        "Grade",
        "Employment type",
        "Business Unit Shift",
        "Commitment hours",
        "Additional Info"
    ]

    # Check which of these columns actually exist in the data
    existing_columns = list(df.columns)
    columns_found = []
    columns_not_found = []

    for col in columns_to_remove:
        if col in existing_columns:
            columns_found.append(col)
        else:
            columns_not_found.append(col)

    # Drop the columns that were found
    if columns_found:
        df_cleaned = df.drop(columns=columns_found)
        print(f"   ✅ Removed {len(columns_found)} columns:")
        for col in columns_found:
            print(f"      - {col}")
    else:
        df_cleaned = df
        print("   ⚠️  No matching columns found to remove")

    # Report the columns that were not found
    if columns_not_found:
        print(f"   ⚠️  {len(columns_not_found)} columns not found in data:")
        for col in columns_not_found:
            print(f"      - {col}")

    print(f"   📊 Remaining columns: {len(df_cleaned.columns)}")
    return df_cleaned

def add_on_time_column(df):
    """
    Add an "On Time" column by comparing Requisition Completed Date with Target Date.
    """
    print("➕ Adding 'On Time' column...")

    # Check that the required columns exist
    required_columns = ["Requisition Completed Date", "Target Date"]
    missing_columns = []

    for col in required_columns:
        if col not in df.columns:
            missing_columns.append(col)

    if missing_columns:
        print(f"   ❌ Missing required columns: {missing_columns}")
        print("   ⚠️  Cannot add 'On Time' column")
        return df

    print("   ✅ Required columns found")

    # Build the new column
    def determine_on_time(row):
        completed_date = row["Requisition Completed Date"]
        target_date = row["Target Date"]

        # Requisition Completed Date is empty
        if pd.isna(completed_date) or completed_date is None:
            return "Still Active"

        # Target Date is empty
        if pd.isna(target_date) or target_date is None:
            return "Unknown"  # حالة احتياطية

        # Convert the dates for comparison
        try:
            if isinstance(completed_date, str):
                completed_date = pd.to_datetime(completed_date)
            if isinstance(target_date, str):
                target_date = pd.to_datetime(target_date)

            # Compare
            if completed_date > target_date:
                return "No"
            else:
                return "Yes"
        except Exception:
            return "Error"  # date conversion failed

    # Apply the function to every row
    df["On Time"] = df.apply(determine_on_time, axis=1)

    # Result statistics
    on_time_counts = df["On Time"].value_counts()
    print("   📊 On Time column statistics:")
    for value, count in on_time_counts.items():
        print(f"      - {value}: {count} records")

    print(f"   ✅ 'On Time' column added successfully")
    return df

def add_date_columns(df):
    """
    Add Month, Year, and Day columns derived from the Target Date column.
    """
    print("📅 Adding date columns (Month, Year, Day) from Target Date...")

    # Check that the Target Date column exists
    if "Target Date" not in df.columns:
        print("   ❌ 'Target Date' column not found")
        print("   ⚠️  Cannot add date columns")
        return df

    print("   ✅ 'Target Date' column found")

    # English month names
    month_names = {
        1: "January", 2: "February", 3: "March", 4: "April",
        5: "May", 6: "June", 7: "July", 8: "August",
        9: "September", 10: "October", 11: "November", 12: "December"
    }

    # Helper to extract the month name
    def get_month_name(date_value):
        if pd.isna(date_value) or date_value is None:
            return "Unknown"
        try:
            if isinstance(date_value, str):
                date_value = pd.to_datetime(date_value)
            return month_names.get(date_value.month, "Unknown")
        except Exception:
            return "Error"

    # Helper to extract the year
    def get_year(date_value):
        if pd.isna(date_value) or date_value is None:
            return "Unknown"
        try:
            if isinstance(date_value, str):
                date_value = pd.to_datetime(date_value)
            return str(date_value.year)
        except Exception:
            return "Error"

    # Helper to extract the day
    def get_day(date_value):
        if pd.isna(date_value) or date_value is None:
            return "Unknown"
        try:
            if isinstance(date_value, str):
                date_value = pd.to_datetime(date_value)
            return str(date_value.day)
        except Exception:
            return "Error"

    # Add the new columns
    df["Target Month"] = df["Target Date"].apply(get_month_name)
    df["Target Year"] = df["Target Date"].apply(get_year)
    df["Target Day"] = df["Target Date"].apply(get_day)

    # Result statistics
    print("   📊 Date columns statistics:")

    # Month breakdown
    month_counts = df["Target Month"].value_counts()
    print("      Months:")
    for month, count in month_counts.head(5).items():  # top 5 months
        print(f"         - {month}: {count} records")
    if len(month_counts) > 5:
        print(f"         - ... and {len(month_counts) - 5} more months")

    # Year breakdown
    year_counts = df["Target Year"].value_counts()
    print("      Years:")
    for year, count in year_counts.items():
        print(f"         - {year}: {count} records")

    print(f"   ✅ Date columns added successfully")
    return df

def add_total_active_column(df):
    """
    Add a "Total Active" column: 1 if Requisition Completed Date is empty, 0 otherwise.
    """
    print("🔢 Adding 'Total Active' column...")

    # Check that the Requisition Completed Date column exists
    if "Requisition Completed Date" not in df.columns:
        print("   ❌ 'Requisition Completed Date' column not found")
        print("   ⚠️  Cannot add 'Total Active' column")
        return df

    print("   ✅ 'Requisition Completed Date' column found")

    # Helper to determine the Total Active value
    def determine_total_active(completed_date):
        # Date is empty or null
        if pd.isna(completed_date) or completed_date is None:
            return "1"  # active
        else:
            return "0"  # completed

    # Add the new column
    df["Total Active"] = df["Requisition Completed Date"].apply(determine_total_active)

    # Result statistics
    active_counts = df["Total Active"].value_counts()
    print("   📊 Total Active column statistics:")
    for value, count in active_counts.items():
        status = "Active" if value == "1" else "Completed"
        print(f"      - {value} ({status}): {count} records")

    print(f"   ✅ 'Total Active' column added successfully")
    return df

def add_total_on_time_column(df):
    """
    Add a "Total On Time" column: 1 if On Time = "Yes", 0 for anything else.
    """
    print("⏰ Adding 'Total On Time' column...")

    # Check that the On Time column exists
    if "On Time" not in df.columns:
        print("   ❌ 'On Time' column not found")
        print("   ⚠️  Cannot add 'Total On Time' column")
        return df

    print("   ✅ 'On Time' column found")

    # Helper to determine the Total On Time value
    def determine_total_on_time(on_time_value):
        # Value is "Yes" (case-insensitive)
        if str(on_time_value).strip().lower() == "yes":
            return "1"  # on time
        else:
            return "0"  # not on time, or any other status

    # Add the new column
    df["Total On Time"] = df["On Time"].apply(determine_total_on_time)

    # Result statistics
    on_time_counts = df["Total On Time"].value_counts()
    print("   📊 Total On Time column statistics:")
    for value, count in on_time_counts.items():
        status = "On Time" if value == "1" else "Not On Time"
        print(f"      - {value} ({status}): {count} records")

    # Detailed breakdown of the original column
    original_counts = df["On Time"].value_counts()
    print("   📊 Original 'On Time' values breakdown:")
    for value, count in original_counts.items():
        converted = "1" if str(value).strip().lower() == "yes" else "0"
        print(f"      - '{value}' → {converted}: {count} records")

    print(f"   ✅ 'Total On Time' column added successfully")
    return df

def add_region_column(df):
    """
    Add a "Region" column based on the contents of the "Department" column.
    """
    print("🌍 Adding 'Region' column based on Department...")

    # Check that the Department column exists
    if "Department" not in df.columns:
        print("   ❌ 'Department' column not found")
        print("   ⚠️  Cannot add 'Region' column")
        return df

    print("   ✅ 'Department' column found")

    # Language lists for each region
    asian_languages = [
        "Hindi", "Urdu", "Bengali", "Tagalog", "Burmese", "Vietnamese", "Pashto", "Dari", "Farsi",
        "Nepali", "Ilocano", "Telugu", "Tedim", "Cebuano", "Punjabi", "Cantonese", "Korean",
        "Mandarin", "Kurdish", "Marathi", "Karen", "Hakha (Chin)", "Chuukese", "Tamil", "Cambodian",
        "Laotian", "Malay", "Marshallese", "Tibetan", "Fuzhou", "Chinese", "Falam", "Krahn",
        "Sinhalese", "Hebrew", "Hmong", "Thai", "Hakka (Chinese)", "Gujarati", "Foochow",
        "Fukienese", "Japanese", "Hakha Chin", "Tetum", "Mongolian"
    ]

    european_languages = [
        "Turkish", "Romanian", "Uzbek", "Russian", "Ukrainian", "Albanian", "Lithuanian", "Polish",
        "Serbian", "Bosnian", "Croatian", "Greek", "Bulgarian", "Armenian", "Hungarian", "Macedonian",
        "Dutch", "Norwegian", "Czech", "Finnish", "Danish", "Slovak", "Swedish", "Kazakh",
        "Estonian", "Georgian"
    ]

    latin_languages = [
        "Spanish", "Portuguese", "Haitian Creole", "Quiche", "Quechua"
    ]

    middle_east_africa_languages = [
        "Arabic", "Swahili", "Somali", "Kinyarwanda", "Kirundi", "Oromo", "Amharic", "Tigrinya",
        "Luganda", "Lingala", "Yoruba", "Krio", "Pidgin", "Hausa", "Ibo", "Afrikaans", "Wolof",
        "Bambara", "Berber", "Zulu", "Kikuyu", "Cape Verdean Creole", "Fulani", "Mam", "Nuer",
        "Dinka", "Herrero"
    ]

    local_market_languages = [
        "French", "Italian", "German"
    ]

    def determine_region(department_value):
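        # The language lists above are checked in order; the first language found
        # in the Department text (case-insensitive substring match) decides the region.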
        if pd.isna(department_value) or department_value is None:
            return None

        department_str = str(department_value)

        # Check Asian languages
        for lang in asian_languages:
            if lang.lower() in department_str.lower():
                return "Asian"

        # Check European languages
        for lang in european_languages:
            if lang.lower() in department_str.lower():
                return "Europe"

        # Check Latin languages
        for lang in latin_languages:
            if lang.lower() in department_str.lower():
                return "Latin"

        # Check Middle East & Africa languages
        for lang in middle_east_africa_languages:
            if lang.lower() in department_str.lower():
                return "Middle East & Africa"

        # Check local market languages
        for lang in local_market_languages:
            if lang.lower() in department_str.lower():
                return "Local Market"

        # No match found
        return None

    # Add the new column
    df["Region"] = df["Department"].apply(determine_region)

    # Result statistics
    region_counts = df["Region"].value_counts(dropna=False)
    print("   📊 Region column statistics:")
    for region, count in region_counts.items():
        region_name = "Unknown/No Match" if pd.isna(region) else region
        print(f"      - {region_name}: {count} records")

    print(f"   ✅ 'Region' column added successfully")
    return df

def add_ta_spv_column(df):
    """
    Add a "TA SPV" column based on the "Region" column.
    """
    print("👤 Adding 'TA SPV' column based on Region...")

    # Check that the Region column exists
    if "Region" not in df.columns:
        print("   ❌ 'Region' column not found")
        print("   ⚠️  Cannot add 'TA SPV' column")
        return df

    print("   ✅ 'Region' column found")

    # Helper to map a region to its TA SPV
    def determine_ta_spv(region_value):
        if pd.isna(region_value) or region_value is None:
            return None

        region_str = str(region_value).strip()

        if region_str == "Asian":
            return "Mahmoud Abdelrahman"
        elif region_str == "Europe":
            return "Aya Mohamed"
        elif region_str == "Latin":
            return "Habiba Saad"
        elif region_str == "Middle East & Africa":
            return "Sohaila Maher"
        elif region_str == "Local Market":
            return "Sohaila Maher"
        else:
            return None

    # Add the new column
    df["TA SPV"] = df["Region"].apply(determine_ta_spv)

    # Result statistics
    spv_counts = df["TA SPV"].value_counts(dropna=False)
    print("   📊 TA SPV column statistics:")
    for spv, count in spv_counts.items():
        spv_name = "Unknown/No Assignment" if pd.isna(spv) else spv
        print(f"      - {spv_name}: {count} records")

    # Detailed breakdown by region
    region_spv_mapping = df.groupby(['Region', 'TA SPV']).size().reset_index(name='count')
    if not region_spv_mapping.empty:
        print("   📊 Region to TA SPV mapping:")
        for _, row in region_spv_mapping.iterrows():
            region = "Unknown" if pd.isna(row['Region']) else row['Region']
            spv = "No Assignment" if pd.isna(row['TA SPV']) else row['TA SPV']
            print(f"      - {region} → {spv}: {row['count']} records")

    print(f"   ✅ 'TA SPV' column added successfully")
    return df



def merge_csv_files():
    """
    Merge the Active Requisitions and Completed Requisitions files into a single CSV file.
    """
    # Folder containing the source files
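    # (machine-specific path; adjust to wherever the Zoho exports are saved)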
    files_directory = r"C:\Users\Mostafa Rizk\Desktop\zoho\File"
    
    # File names
    active_file = "Active Requisitions Report.xls"
    completed_file = "Completed Requisitions.xls"
    
    # Full file paths
    active_path = os.path.join(files_directory, active_file)
    completed_path = os.path.join(files_directory, completed_file)
    
    try:
        print("🔄 Starting file merge process...")
        
        # Check that the files exist
        if not os.path.exists(active_path):
            print(f"❌ File not found: {active_file}")
            return False
            
        if not os.path.exists(completed_path):
            print(f"❌ File not found: {completed_file}")
            return False
        
        print(f"📖 Reading {active_file} to get column headers from row 2...")
        # Read the Active Requisitions file to get column headers from the second row
        try:
            active_df_temp = pd.read_excel(active_path, header=None)
            # Column headers come from the second row (index 1)
            active_headers = list(active_df_temp.iloc[1])
            # Data starts from the third row onwards (index 2+)
            active_data_df = active_df_temp.iloc[2:].reset_index(drop=True)
            active_data_df.columns = active_headers

            print(f"✅ Successfully read {active_file} - {len(active_data_df)} data rows")
            print(f"📋 Column headers from Active file (row 2): {active_headers}")
        except Exception as e:
            print(f"❌ Error reading {active_file}: {str(e)}")
            return False

        print(f"📖 Reading {completed_file} data only (skipping first 2 rows)...")
        # Read the Completed Requisitions file without headers (skip the first 2 rows)
        try:
            completed_df = pd.read_excel(completed_path, header=None, skiprows=2)
            print(f"✅ Successfully read {completed_file} - {len(completed_df)} rows (first 2 rows skipped)")
        except Exception as e:
            print(f"❌ Error reading {completed_file}: {str(e)}")
            return False

        # Clean the data: drop "Total records in this page" rows and everything below them
        print("🧹 Cleaning data from both files...")
        active_data_df = clean_dataframe(active_data_df, active_file)
        completed_df = clean_dataframe(completed_df, completed_file)

        # Check the column counts and reconcile them
        print("🔄 Applying Active Requisitions column headers to data...")

        if len(active_headers) != len(completed_df.columns):
            print(f"⚠️  Column count mismatch - Adjusting...")
            print(f"   Active headers: {len(active_headers)} columns")
            print(f"   Completed data: {len(completed_df.columns)} columns")

            # Use the smaller column count to avoid misalignment errors
            min_cols = min(len(active_headers), len(completed_df.columns))
            active_headers_adjusted = active_headers[:min_cols]
            completed_df = completed_df.iloc[:, :min_cols]
            active_data_df = active_data_df.iloc[:, :min_cols]
            print(f"   ✅ Adjusted to use first {min_cols} columns")

            # Apply the adjusted column headers
            completed_df.columns = active_headers_adjusted
            active_data_df.columns = active_headers_adjusted
            final_headers = active_headers_adjusted
        else:
            # Apply the column headers from the Active file to both datasets
            completed_df.columns = active_headers
            active_data_df.columns = active_headers
            final_headers = active_headers
            print(f"✅ Column count matches - Applied Active headers to both datasets")

        print(f"📋 Final column headers used: {final_headers}")

        # Add a column to mark the data source
        active_data_df['Status_Type'] = 'Active'
        completed_df['Status_Type'] = 'Completed'

        print("🔗 Merging data from both files...")
        # Merge the data (no duplicated header rows)
        merged_df = pd.concat([active_data_df, completed_df], ignore_index=True)

        print(f"✅ Files merged successfully - Total rows: {len(merged_df)}")

        # Remove unwanted columns
        merged_df = remove_unwanted_columns(merged_df)

        # إضافة عمود "On Time"
        merged_df = add_on_time_column(merged_df)

        # Add the date columns (Month, Year, Day)
        merged_df = add_date_columns(merged_df)

        # إضافة عمود "Total Active"
        merged_df = add_total_active_column(merged_df)

        # إضافة عمود "Total On Time"
        merged_df = add_total_on_time_column(merged_df)

        # إضافة عمود "Region"
        merged_df = add_region_column(merged_df)

        # إضافة عمود "TA SPV"
        merged_df = add_ta_spv_column(merged_df)

        # Build the output file name
        output_filename = "Merged_Requisitions.csv"
        output_path = os.path.join(files_directory, output_filename)

        print(f"💾 Saving merged file as: {output_filename}")
        # Save the merged file
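        # utf-8-sig adds a BOM so Excel detects the UTF-8 encoding correctly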
        merged_df.to_csv(output_path, index=False, encoding='utf-8-sig')

        print("✅ File merge completed successfully!")
        print(f"📁 Merged file saved to: {output_path}")
        print(f"📊 Total records (after cleaning): {len(merged_df)}")
        print(f"📊 Active records (after cleaning): {len(active_data_df)}")
        print(f"📊 Completed records (after cleaning): {len(completed_df)}")

        # Show additional column information
        print(f"📋 Columns in merged file: {list(merged_df.columns)}")
        print("🧹 Note: Rows containing 'Total records in this page' and below were removed")
        
        return True
        
    except Exception as e:
        print(f"❌ Unexpected error during merge: {str(e)}")
        return False

def main():
    """
    Main entry point.
    """
    print("=" * 50)
    print("🔄 ZOHO FILES MERGER")
    print("=" * 50)
    
    success = merge_csv_files()
    
    if success:
        print("\n🎉 Process completed successfully!")
    else:
        print("\n❌ Process failed!")
    
    print("=" * 50)





if __name__ == "__main__":
    main()
