# /// script
# requires-python = ">=3.14"
# dependencies = [
#     "pandas>=3.0.0",
# ]
# ///
import uuid
from pathlib import Path

import pandas as pd

# HOW TO USE:
# - configure the three variables below
# - run `uv run umami_import.py`

# Enter the website ID from your new self-hosted instance
NEW_WEBSITE_ID = ""
# Folder where the export files from your cloud instance are saved
EXPORT_FOLDER = ""
# Folder where you want to save your generated import files
IMPORT_FOLDER = ""

# Based on scripts from https://github.com/RoversX/umami-csv-import-script
# with several improvements


def main(
    website_id_new: str,
    website_event_old_file: Path,
    session_new_file: Path,
    website_event_new_file: Path,
    event_data_old_file: Path,
    event_data_new_file: Path,
) -> None:
    """Runs all table generation functions."""
    generate_website_event(
        website_id_new=website_id_new,
        website_event_old_file=website_event_old_file,
        website_event_new_file=website_event_new_file,
    )
    generate_session(
        website_id_new=website_id_new,
        website_event_old_file=website_event_old_file,
        session_new_file=session_new_file,
    )
    generate_event_data(
        website_event_old_file=website_event_old_file,
        event_data_old_file=event_data_old_file,
        event_data_new_file=event_data_new_file,
        website_id_new=website_id_new,
    )


def generate_website_event(
    website_id_new: str,
    website_event_old_file: Path,
    website_event_new_file: Path,
) -> None:
    """Generate a new website_event table using the website_event file from
    the Umami data export.

    Replaces the old website_id with website_id_new.
    """
    # Load the original CSV file
    website_event_old = pd.read_csv(website_event_old_file)

    # Update the website_id column with the user-provided website ID
    website_event_old["website_id"] = website_id_new

    # Define the columns required for the website_event table
    # Check in psql with `\d website_event`
    # Must be in exactly the same order as the table schema
    website_event_columns = [
        "event_id",
        "website_id",
        "session_id",
        "created_at",
        "url_path",
        "url_query",
        "referrer_path",
        "referrer_query",
        "referrer_domain",
        "page_title",
        "event_type",
        "event_name",
        "visit_id",
        "tag",
        "fbclid",
        "gclid",
        "li_fat_id",
        "msclkid",
        "ttclid",
        "twclid",
        "utm_campaign",
        "utm_content",
        "utm_medium",
        "utm_source",
        "utm_term",
        "hostname",
    ]

    # Create a new DataFrame for the website_event data with the required columns
    df_website_event = website_event_old[website_event_columns]

    # De-duplicate on the primary key; "first" skips missing values per column
    website_event_unique = df_website_event.groupby(
        "event_id", as_index=False
    ).aggregate("first")

    # Save the new website_event data to a CSV file
    website_event_new_file.parent.mkdir(exist_ok=True, parents=True)
    website_event_unique.to_csv(website_event_new_file, index=False)
    print(f"Successfully generated {website_event_new_file}")


def generate_session(
    website_id_new: str,
    website_event_old_file: Path,
    session_new_file: Path,
) -> None:
    """Generate a new session table using the website_event file from the
    Umami data export.

    Replaces the old website_id with website_id_new.
    """
    # Load the original CSV file
    website_event_old = pd.read_csv(website_event_old_file)

    # Update the website_id column with the user-provided website ID
    website_event_old["website_id"] = website_id_new

    # Define the columns required for the session table
    # Check in psql with `\d session`
    # Must be in exactly the same order as the table schema
    sessions_aggregations = {
        # 'session_id' will be the key for the aggregation
        "website_id": "first",
        "browser": "first",
        "os": "first",
        "device": "first",
        "screen": "first",
        "language": "first",
        "country": "first",
        "region": "first",
        "city": "first",
        "created_at": "min",  # pick first date-time
        "distinct_id": "first",
    }

    unique_sessions = website_event_old.groupby("session_id", as_index=False).aggregate(
        sessions_aggregations
    )

    # Save the new session data to a CSV file
    session_new_file.parent.mkdir(exist_ok=True, parents=True)
    unique_sessions.to_csv(session_new_file, index=False)
    print(f"Successfully generated {session_new_file}")


def generate_event_data(
    website_id_new: str,
    website_event_old_file: Path,
    event_data_old_file: Path,
    event_data_new_file: Path,
) -> None:
    """
    Generate a new event_data table using the website_event and event_data
    files from the Umami data export.

    Replaces the old website_id with website_id_new.

    The event_id column from the event_data export file may not always match
    event_id entries from the website_event export file. If this is the case,
    the function will try to find a match based on unique timestamps if
    possible. If there are still event_data rows that are not successfully
    matched to website_event rows, the function will print a warning about it
    and output a file of the un-matched event_data rows.
    """
    # skip if there is no event_data_old file
    if not event_data_old_file.exists():
        print(
            f"No event_data export file found at {event_data_old_file}, skipping event_data import file generation."
        )
        return

    event_data_old = pd.read_csv(event_data_old_file)

    # skip if there is no event data
    if len(event_data_old) == 0:
        print(
            f"Event data export file found at {event_data_old_file} is empty, skipping event event_data import file generation."
        )
        return

    website_event_old = pd.read_csv(website_event_old_file)

    # Check in psql with `\d event_data`
    # Must be in exactly the same order as the table schema
    event_data_columns = [
        "event_data_id",
        "website_id",
        "website_event_id",
        "data_key",
        "string_value",
        "number_value",
        "date_value",
        "data_type",
        "created_at",
    ]

    # website_event_id is further checked to make sure it matches the
    # website_event table, so the column name is prefixed with '_' here
    event_data_new = pd.DataFrame().assign(
        event_data_id=[uuid.uuid4() for _ in range(len(event_data_old.index))],
        website_id=website_id_new,
        _website_event_id=event_data_old["event_id"],
        data_key=event_data_old["data_key"],
        string_value=event_data_old["string_value"],
        number_value=event_data_old["number_value"],
        date_value=event_data_old["date_value"],
        data_type=event_data_old["data_type"],
        created_at=event_data_old["created_at"],
    )

    # pull matches betweeen _website_event_id and website_event.event_id
    # both direct matches and cases where there is a 1:1 match using created_at
    event_data_new = event_data_new.merge(
        pd.DataFrame().assign(
            _website_event_id=website_event_old["event_id"],
            _weid_match=website_event_old["event_id"],
        ),
        how="left",
        on="_website_event_id",
    ).merge(
        # keep=False drops ALL rows with a duplicated timestamp, so only
        # unambiguous (1:1) created_at values can produce a fallback match
        pd.DataFrame()
        .assign(
            created_at=website_event_old["created_at"],
            _weid_created_at_match=website_event_old["event_id"],
        )
        .drop_duplicates(subset=["created_at"], keep=False),
        how="left",
        on="created_at",
    )

    # assign website_event_id based on direct matches with a fallback to
    # matches based on 1:1 created_at matches
    event_data_new = event_data_new.assign(
        website_event_id=event_data_new["_weid_match"].fillna(
            event_data_new["_weid_created_at_match"]
        )
    )

    # replace \N string for null with None
    event_data_new = event_data_new.replace(r"\N", None)

    # set column order to match db schema
    event_data_new = event_data_new[event_data_columns]

    # warn and save a list of unmatched events, if there are any
    matched_mask = event_data_new["website_event_id"].notna()
    unmatched_event_data = event_data_new[~matched_mask]
    if len(unmatched_event_data) > 0:
        unmatched_path = event_data_new_file.with_name(
            event_data_new_file.name + "_unmatched"
        )
        print(
            f"{len(unmatched_event_data)} event_data rows were unable to be matched to a website event. Saving list of unmatched entries to {unmatched_path}"
        )
        unmatched_path.parent.mkdir(exist_ok=True, parents=True)
        unmatched_event_data.to_csv(unmatched_path, index=False)

    # save validated event data to csv
    # (print AFTER the write succeeds, consistent with the other generators)
    matched_event_data = event_data_new[matched_mask]
    event_data_new_file.parent.mkdir(exist_ok=True, parents=True)
    matched_event_data.to_csv(event_data_new_file, index=False)
    print(f"Successfully generated {event_data_new_file}")


if __name__ == "__main__":
    main(
        website_id_new=NEW_WEBSITE_ID,
        website_event_old_file=Path(EXPORT_FOLDER) / "website_event.csv",
        session_new_file=Path(IMPORT_FOLDER) / "session_new.csv",
        website_event_new_file=Path(IMPORT_FOLDER) / "website_event_new.csv",
        event_data_old_file=Path(EXPORT_FOLDER) / "event_data.csv",
        event_data_new_file=Path(IMPORT_FOLDER) / "event_data_new.csv",
    )