#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Synthèse des logs par origine pour la page Manon2026.

Usage :
  python3 synthese_logs_origine.py /var/log/apache2/access.log
  python3 synthese_logs_origine.py /var/log/nginx/access.log --filtre Manon2026 --sortie synthese-logs-origine.csv

Le script lit les logs Apache/Nginx au format "combined" (fichiers .gz acceptés), ne retient que
les lignes dont l'URL contient le texte de --filtre (Manon2026 par défaut),
puis produit un CSV groupé par origine :
- domaine du référent (en-tête Referer) si présent ;
- sinon adresse IP source.

Colonnes générées (séparateur « ; ») :
origine, ip, referent, visites, telechargements, erreurs, mo_transferes, premiere_vue, derniere_vue, pages, user_agents
"""

from __future__ import annotations

import argparse
import csv
import gzip
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse, unquote


# Pattern for one access-log line. It exposes the named groups ip, date,
# method, url, proto, status, size, referer and ua. main() applies it with
# LOG_RE.match(), so a line must match from its first character.
LOG_RE = re.compile(
    # Client address, two ignored fields, then the bracketed timestamp.
    r'(?P<ip>\S+) \S+ \S+ \[(?P<date>[^\]]+)\] '
    # Quoted request line: method, URL, and whatever follows up to the quote.
    r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<proto>[^"]*)" '
    # Three-digit status and the size field (any non-blank token, e.g. "-").
    r'(?P<status>\d{3}) (?P<size>\S+) '
    # Quoted referer and user agent; neither may contain a double quote.
    r'"(?P<referer>[^"]*)" "(?P<ua>[^"]*)"'
)

# URL path suffixes that est_telechargement() counts as video downloads
# (compared against the lower-cased path).
VIDEO_EXTENSIONS = (".mp4", ".mov", ".m4v", ".webm")


def ouvrir_log(path: Path):
    """Open a log file for reading as text, decompressing ``.gz`` files.

    Args:
        path: Location of the log file. A ``.gz`` suffix selects gzip
            decompression; any other suffix is read as plain text.

    Returns:
        A text-mode file object that decodes UTF-8 and replaces undecodable
        bytes instead of raising.
    """
    options_texte = {"encoding": "utf-8", "errors": "replace"}
    if path.suffix != ".gz":
        return path.open("r", **options_texte)
    return gzip.open(path, "rt", **options_texte)


def parse_date(value: str) -> str:
    """Convert an access-log timestamp to ISO 8601 text.

    Args:
        value: Timestamp as found between brackets in the log line,
            e.g. ``27/May/2026:14:38:12 +0200``.

    Returns:
        The timestamp in ISO 8601 form with second precision and its
        original UTC offset, or *value* unchanged when it does not match
        the expected layout.
    """
    try:
        horodatage = datetime.strptime(value, "%d/%b/%Y:%H:%M:%S %z")
    except ValueError:
        # Unknown layout: hand the raw text back rather than fail.
        return value
    return horodatage.isoformat(timespec="seconds")


def domaine_referent(referer: str) -> str:
    """Derive the grouping key for a Referer header value.

    Args:
        referer: Raw referer field taken from the log line.

    Returns:
        An empty string when the referer is empty or ``-``; the lower-cased
        network location of the URL when it has one; otherwise the referer
        text unchanged (including when it cannot be parsed as a URL).
    """
    if not referer or referer == "-":
        return ""

    try:
        netloc = urlparse(referer).netloc
    except ValueError:
        # The referer is client-controlled, and urlparse raises on some
        # malformed values (e.g. an unbalanced "[" in the host part).
        # Keep the raw text instead of aborting the whole run.
        return referer

    return netloc.lower() if netloc else referer


def est_telechargement(url: str) -> bool:
    """Tell whether a requested URL points at a video file.

    Args:
        url: Request URL; only its path component is examined, so a query
            string does not affect the result.

    Returns:
        True when the lower-cased path ends with one of VIDEO_EXTENSIONS,
        False otherwise, including when the URL cannot be parsed.
    """
    try:
        chemin = urlparse(url).path
    except ValueError:
        # Request URLs are client-controlled, and urlparse raises on some
        # malformed values (e.g. an unbalanced "[" in the host part).
        # Such a request is not counted as a download.
        return False
    return chemin.lower().endswith(VIDEO_EXTENSIONS)


def raccourcir_liste(values, limite=6):
    """Render a collection of strings as a short, sorted, ``|``-separated list.

    Args:
        values: Iterable of strings; empty entries and ``-`` are dropped.
        limite: Maximum number of entries written out in full.

    Returns:
        An empty string when nothing remains after filtering; otherwise the
        sorted entries joined by `` | ``, followed by a ``+N autre(s)`` marker
        when more than *limite* entries remain.
    """
    # Sort first, then discard placeholders, so the order of operations
    # matches the original.
    retenues = list(filter(lambda v: v and v != "-", sorted(values)))
    if not retenues:
        return ""

    surplus = len(retenues) - limite
    texte = " | ".join(retenues[:limite])
    if surplus > 0:
        texte += f" | +{surplus} autre(s)"
    return texte


def _horodatage_unix(date: str):
    """Return the POSIX timestamp of an ISO 8601 date string, or None.

    None is returned when *date* is not valid ISO 8601 text, which is the
    case for the raw value parse_date() hands back on failure.
    """
    try:
        return datetime.fromisoformat(date).timestamp()
    except (ValueError, OverflowError, OSError):
        return None


def _noter_date(row: dict, date: str) -> None:
    """Fold one request date into the first/last-seen fields of *row*.

    Dates are ordered by absolute instant, not by their text, so entries
    carrying different UTC offsets are ranked correctly. A date that cannot
    be interpreted is only used as a placeholder while the row has no date.
    """
    cle = _horodatage_unix(date)
    if cle is None:
        if not row["premiere_vue"]:
            row["premiere_vue"] = date
        if not row["derniere_vue"]:
            row["derniere_vue"] = date
        return

    if row["premiere_cle"] is None or cle < row["premiere_cle"]:
        row["premiere_cle"] = cle
        row["premiere_vue"] = date
    if row["derniere_cle"] is None or cle > row["derniere_cle"]:
        row["derniere_cle"] = cle
        row["derniere_vue"] = date


def main():
    """Command-line entry point: summarise access logs into a CSV by origin.

    Reads every log file given on the command line, keeps the lines that
    match LOG_RE and whose percent-decoded URL contains the ``--filtre``
    text, and groups them by origin (referer domain when there is one,
    client IP otherwise). One ``;``-separated CSV row is written per origin
    to ``--sortie``, most visited first, and three counters are printed.

    Malformed client-supplied fields (an unparsable URL or referer, a
    non-numeric size) are tolerated instead of aborting the run.
    """
    parser = argparse.ArgumentParser(description="Synthèse des logs par origine.")
    parser.add_argument("logs", nargs="+", help="Fichier(s) access.log ou access.log.gz")
    parser.add_argument("--filtre", default="Manon2026", help="Texte à rechercher dans l'URL")
    parser.add_argument("--sortie", default="synthese-logs-origine.csv", help="Fichier CSV de sortie")
    args = parser.parse_args()

    synthese = defaultdict(lambda: {
        "ip": set(),
        "referent": set(),
        "visites": 0,
        "telechargements": 0,
        "erreurs": 0,
        "octets": 0,
        "premiere_vue": "",
        "derniere_vue": "",
        # Timestamps backing premiere_vue / derniere_vue; never written out.
        "premiere_cle": None,
        "derniere_cle": None,
        "pages": set(),
        "user_agents": set(),
    })

    lignes_lues = 0
    lignes_filtrees = 0

    for log_path in map(Path, args.logs):
        with ouvrir_log(log_path) as f:
            for line in f:
                lignes_lues += 1
                match = LOG_RE.match(line)
                if not match:
                    continue

                data = match.groupdict()
                url = unquote(data["url"])

                if args.filtre and args.filtre not in url:
                    continue

                lignes_filtrees += 1

                ip = data["ip"]
                referer = data["referer"]
                try:
                    referer_domain = domaine_referent(referer)
                except ValueError:
                    # Unparsable referer: group under its raw text.
                    referer_domain = referer
                try:
                    path = urlparse(url).path
                    telechargement = est_telechargement(url)
                except ValueError:
                    # urlparse rejects some malformed URLs (e.g. an
                    # unbalanced "[" in the host part). Keep the hit, list
                    # the raw URL as the page, and do not count a download.
                    path = url
                    telechargement = False

                origine = referer_domain or ip
                status = int(data["status"])
                # "-" (no body) and any other non-numeric token count as 0.
                size = int(data["size"]) if data["size"].isdecimal() else 0
                date = parse_date(data["date"])

                row = synthese[origine]
                row["ip"].add(ip)
                row["referent"].add(referer)
                row["visites"] += 1
                row["octets"] += size
                row["pages"].add(path)
                row["user_agents"].add(data["ua"])

                if telechargement:
                    row["telechargements"] += 1

                if status >= 400:
                    row["erreurs"] += 1

                _noter_date(row, date)

    sortie = Path(args.sortie)
    with sortie.open("w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=";")
        writer.writerow([
            "origine",
            "ip",
            "referent",
            "visites",
            "telechargements",
            "erreurs",
            "mo_transferes",
            "premiere_vue",
            "derniere_vue",
            "pages",
            "user_agents",
        ])

        # Most visited origins first; ties keep their order of first appearance.
        for origine, row in sorted(synthese.items(), key=lambda item: item[1]["visites"], reverse=True):
            writer.writerow([
                origine,
                raccourcir_liste(row["ip"]),
                raccourcir_liste(row["referent"]),
                row["visites"],
                row["telechargements"],
                row["erreurs"],
                # Bytes converted to units of 1024 * 1024 bytes.
                round(row["octets"] / 1024 / 1024, 2),
                row["premiere_vue"],
                row["derniere_vue"],
                raccourcir_liste(row["pages"]),
                raccourcir_liste(row["user_agents"], limite=3),
            ])

    print(f"Lignes lues : {lignes_lues}")
    print(f"Lignes retenues : {lignes_filtrees}")
    print(f"Synthèse créée : {sortie}")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
