"""
build_dvf_historique.py
=======================
Download raw DGFIP DVF files (2015-2020) for department Nord (59),
enrich with parcel-level GPS, and save as data/{year}/59.csv
matching the geo-DVF format used by 2021-2025 files.

GPS strategy (two-level):
  1. Parcel centroid from cadastre (accurate, ~14-char parcel ID lookup)
     → requires data/2026/parcelles_59_centroids.json built by
       python build_parcel_centroids_59.py
  2. Commune centroid from API Geo (fallback when parcel ID not found)

Source: http://data.cquest.org/dgfip_dvf/
  Raw DGFIP pipe-delimited files (no GPS, French column names, comma decimals).
  Column mapping and format conversion to geo-DVF standard is handled here.

Parcel ID construction from raw DGFIP fields:
  id = dept(2) + commune(3) + prefixe(3) + section(2) + no_plan(4)
  e.g. "59350000AK0216"  (matches etalab cadastre format)

Usage:
    python build_parcel_centroids_59.py   # build GPS cache first (one-time)
    python build_dvf_historique.py
    python build_dvf_historique.py --force
    python build_dvf_historique.py --annees 2018 2019 2020
"""

from __future__ import annotations

import argparse
import csv
import io
import json
import logging
import os
import urllib.request
from datetime import datetime

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("build_dvf_historique")

_DEPT     = "59"
_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
_ANNEES   = [2015, 2016, 2017, 2018, 2019, 2020]

_PARCEL_CENTROIDS_PATH = os.path.join(_DATA_DIR, "2026", "parcelles_59_centroids.json")

# Raw DGFIP national files — stream-parsed and filtered to dept 59
_SOURCES: dict[int, str] = {
    2015: "http://data.cquest.org/dgfip_dvf/201904/valeursfoncieres-2015.txt",
    2016: "http://data.cquest.org/dgfip_dvf/202010/valeursfoncieres-2016.txt",
    2017: "http://data.cquest.org/dgfip_dvf/202010/valeursfoncieres-2017.txt",
    2018: "http://data.cquest.org/dgfip_dvf/202010/valeursfoncieres-2018.txt",
    2019: "http://data.cquest.org/dgfip_dvf/202010/valeursfoncieres-2019.txt",
    2020: "http://data.cquest.org/dgfip_dvf/202010/valeursfoncieres-2020.txt",
}

_API_GEO = (
    "https://geo.api.gouv.fr/communes"
    "?codeDepartement=59&fields=code,centre&format=json&limit=700"
)

_OUT_COLS = [
    "id_mutation", "date_mutation", "nature_mutation", "valeur_fonciere",
    "adresse_numero", "adresse_nom_voie", "code_postal",
    "code_commune", "nom_commune", "code_departement",
    "type_local", "surface_reelle_bati", "latitude", "longitude",
]


# ---------------------------------------------------------------------------
# GPS lookups
# ---------------------------------------------------------------------------

def _load_parcel_centroids() -> dict[str, tuple[float, float]]:
    """
    Read the parcel centroid cache written by build_parcel_centroids_59.py.

    Each JSON value is reduced to its first two entries, converted to float;
    telecharger_annee unpacks that pair as (lat, lon).

    Returns:
        {parcel_id: (float, float)}, or an empty dict (after logging a
        warning) when the cache file does not exist.
    """
    cache_present = os.path.exists(_PARCEL_CENTROIDS_PATH)
    if cache_present:
        log.info("Loading parcel centroid cache…")
        with open(_PARCEL_CENTROIDS_PATH, encoding="utf-8") as handle:
            payload = json.load(handle)

        centroids: dict[str, tuple[float, float]] = {}
        for parcel_id, coords in payload.items():
            centroids[parcel_id] = (float(coords[0]), float(coords[1]))

        log.info("  %d parcel centroids loaded", len(centroids))
        return centroids

    # No cache: the caller falls back to commune-level centroids.
    log.warning(
        "Parcel centroid cache not found: %s\n"
        "  Run: python build_parcel_centroids_59.py\n"
        "  Falling back to commune centroids only (less accurate IRIS join).",
        _PARCEL_CENTROIDS_PATH,
    )
    return {}


def _fetch_commune_centroids() -> dict[str, tuple[float, float]]:
    """
    Query API Geo (_API_GEO) and return {code: (lat, lon)} per commune.

    Entries without a 'centre' holding at least two coordinates are skipped.
    The API's coordinate pair is swapped: index 1 becomes lat, index 0 lon.
    """
    log.info("Fetching commune centroids from API Geo (fallback GPS)…")
    request = urllib.request.Request(
        _API_GEO, headers={"User-Agent": "agent-immobilier/1.0"}
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read()
    communes = json.loads(body.decode("utf-8"))

    centroids: dict[str, tuple[float, float]] = {}
    for commune in communes:
        centre = commune.get("centre") or {}
        point = centre.get("coordinates", [])
        if len(point) < 2:
            continue
        lon, lat = point[0], point[1]
        centroids[commune.get("code", "")] = (float(lat), float(lon))

    log.info("  %d commune centroids loaded", len(centroids))
    return centroids


# ---------------------------------------------------------------------------
# Row conversion helpers
# ---------------------------------------------------------------------------

def _parse_date(raw: str) -> str:
    """Convert DD/MM/YYYY → YYYY-MM-DD; return raw if unparseable."""
    s = raw.strip()
    try:
        return datetime.strptime(s, "%d/%m/%Y").strftime("%Y-%m-%d")
    except ValueError:
        return s


def _parse_float(raw: str) -> str:
    """Convert French comma-decimal string to period-decimal."""
    return raw.strip().replace(" ", "").replace(",", ".")


def _build_code_commune(code_dept: str, code_commune_raw: str) -> str:
    """
    Build the full 5-digit INSEE commune code.
    Raw DGFIP: Code departement='59', Code commune='350' → '59350'.
    """
    return code_dept.strip().zfill(2) + code_commune_raw.strip().zfill(3)


def _build_parcel_id(
    code_dept:        str,
    code_commune_raw: str,
    prefixe:          str,
    section:          str,
    no_plan:          str,
) -> str:
    """
    Build the 14-char etalab parcel ID from raw DGFIP fields.

    Format: dept(2) + commune(3) + prefixe(3) + section(2) + plan(4)
    Examples:
      dept=59, commune=350, prefixe='', section='AK', plan='216'
      → '59350000AK0216'

      dept=59, commune=350, prefixe='', section='B', plan='2378'
      → '59350000B02378'  (single-char section padded left: B → 0B)
    """
    commune5 = code_dept.strip().zfill(2) + code_commune_raw.strip().zfill(3)
    pref3    = (prefixe or "").strip().zfill(3)   # '' → '000', '001' → '001'
    sect2    = (section or "").strip().upper().zfill(2)  # 'B' → '0B', 'AK' → 'AK'
    plan4    = (no_plan or "").strip().zfill(4)   # '216' → '0216'
    return commune5 + pref3 + sect2 + plan4


def _make_id(code_commune: str, date_raw: str, no_disposition: str, valeur_fonciere_raw: str) -> str:
    """
    Stable id_mutation grouping lots of the same acte.

    'No disposition' in raw DGFIP is a per-lot counter within each acte (1, 2, 3 …),
    not a globally unique acte identifier.  Multiple unrelated actes on the same day
    can all have No disposition = 1.  Including the total price (valeur_fonciere)
    separates them: same date + same price + same disposition → same acte;
    different price → different acte.
    """
    date_compact = date_raw.strip().replace("/", "")
    vf = valeur_fonciere_raw.strip().replace(" ", "").replace(",", ".")
    try:
        vf_key = f"{float(vf):.0f}" if vf else "0"
    except ValueError:
        vf_key = "0"
    return f"{code_commune}_{date_compact}_{no_disposition.strip().zfill(6)}_{vf_key}"


# ---------------------------------------------------------------------------
# Download and filter one year
# ---------------------------------------------------------------------------

def _dest_path(annee: int) -> str:
    """Return the output CSV path for *annee*: <_DATA_DIR>/<annee>/<_DEPT>.csv."""
    year_dir = os.path.join(_DATA_DIR, str(annee))
    return os.path.join(year_dir, _DEPT + ".csv")


def telecharger_annee(
    annee:              int,
    force:              bool,
    parcel_centroids:   dict[str, tuple[float, float]],
    commune_centroids:  dict[str, tuple[float, float]],
) -> bool:
    """
    Stream one year of raw DGFIP DVF data and write the dept-59 rows as CSV.

    The national pipe-delimited file is read row by row, filtered on
    'Code departement', converted to the _OUT_COLS layout and written to a
    temporary file, which is moved onto the destination only once the whole
    stream has been processed.

    Args:
        annee: Year to process; must be a key of _SOURCES.
        force: Re-download even when the destination file already exists.
        parcel_centroids: {parcel_id: (lat, lon)}, tried first for each row.
        commune_centroids: {code_commune: (lat, lon)}, used when the parcel
            ID is not in parcel_centroids.

    Returns:
        True when the destination file is present (already there, or freshly
        written); False when no URL is defined for the year or the
        download/conversion raised an exception.
    """
    dest = _dest_path(annee)
    if os.path.exists(dest) and not force:
        size_mb = os.path.getsize(dest) / 1_048_576
        log.info(
            "%d: already present (%s, %.1f Mo) — skip (--force to re-download)",
            annee, dest, size_mb,
        )
        return True

    url = _SOURCES.get(annee)
    if not url:
        log.error("%d: no source URL defined", annee)
        return False

    log.info("%d: streaming from %s", annee, url)
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    # Write to a sibling temp file so an interrupted run never leaves a
    # truncated CSV at the final path (which a later run would then skip).
    tmp = dest + ".tmp"

    count_in        = 0
    count_out       = 0
    gps_parcel      = 0  # rows with parcel-level GPS
    gps_commune     = 0  # rows with commune-centroid fallback GPS
    gps_missing     = 0  # rows with no GPS at all

    completed = False
    try:
        req = urllib.request.Request(
            url, headers={"User-Agent": "agent-immobilier/1.0"}
        )
        with urllib.request.urlopen(req, timeout=600) as resp:
            total_bytes = int(resp.headers.get("Content-Length", 0))
            log.info("  %d: file size %.1f Mo", annee, total_bytes / 1_048_576)

            # newline="" lets the csv module do its own line-ending handling,
            # as its documentation requires for file objects.
            # NOTE(review): latin-1 maps every byte to a character, so decoding
            # can never fail — confirm it is the real encoding of the source.
            text_stream = io.TextIOWrapper(resp, encoding="latin-1", newline="")
            reader = csv.DictReader(text_stream, delimiter="|")

            with open(tmp, "w", encoding="utf-8", newline="") as out_f:
                writer = csv.DictWriter(out_f, fieldnames=_OUT_COLS)
                writer.writeheader()

                for row in reader:
                    count_in += 1

                    # Progress is reported on rows READ, so it must be checked
                    # before the department filter skips the row.
                    if count_in % 500_000 == 0:
                        log.info(
                            "  %d: %d rows read, %d for dept 59 "
                            "(parcel GPS: %d, commune GPS: %d, missing: %d)",
                            annee, count_in, count_out,
                            gps_parcel, gps_commune, gps_missing,
                        )

                    code_dept = (row.get("Code departement") or "").strip()
                    if code_dept != _DEPT:
                        continue

                    code_commune_raw = (row.get("Code commune") or "").strip()
                    code_commune     = _build_code_commune(code_dept, code_commune_raw)

                    # GPS: parcel centroid → commune centroid → missing
                    parcel_id = _build_parcel_id(
                        code_dept,
                        code_commune_raw,
                        row.get("Prefixe de section") or "",
                        row.get("Section") or "",
                        row.get("No plan") or "",
                    )
                    if parcel_id in parcel_centroids:
                        lat, lon = parcel_centroids[parcel_id]
                        gps_parcel += 1
                    elif code_commune in commune_centroids:
                        lat, lon = commune_centroids[code_commune]
                        gps_commune += 1
                    else:
                        lat, lon = None, None
                        gps_missing += 1

                    date_raw = (row.get("Date mutation") or "").strip()
                    no_disp  = (row.get("No disposition") or "").strip()
                    valeur   = row.get("Valeur fonciere") or ""

                    # Pad to 5 digits only when a value exists; a missing
                    # postal code must stay empty rather than become "00000".
                    code_postal = (row.get("Code postal") or "").strip()
                    if code_postal:
                        code_postal = code_postal.zfill(5)

                    out_row = {
                        "id_mutation":         _make_id(
                            code_commune, date_raw, no_disp, valeur,
                        ),
                        "date_mutation":       _parse_date(date_raw),
                        "nature_mutation":     (row.get("Nature mutation") or "").strip(),
                        "valeur_fonciere":     _parse_float(valeur),
                        "adresse_numero":      (row.get("No voie") or "").strip(),
                        "adresse_nom_voie":    (row.get("Voie") or "").strip(),
                        "code_postal":         code_postal,
                        "code_commune":        code_commune,
                        "nom_commune":         (row.get("Commune") or "").strip(),
                        "code_departement":    code_dept,
                        "type_local":          (row.get("Type local") or "").strip(),
                        "surface_reelle_bati": _parse_float(
                            row.get("Surface reelle bati") or ""
                        ),
                        "latitude":  str(lat) if lat is not None else "",
                        "longitude": str(lon) if lon is not None else "",
                    }
                    writer.writerow(out_row)
                    count_out += 1

        completed = True

    except Exception as exc:
        log.error("%d: download failed — %s", annee, exc)
        return False
    finally:
        # Runs on failure AND on interruption (e.g. Ctrl-C), so a partial
        # temp file is never left behind.
        if not completed and os.path.exists(tmp):
            os.remove(tmp)

    os.replace(tmp, dest)
    size_mb = os.path.getsize(dest) / 1_048_576
    total_gps = gps_parcel + gps_commune
    # Share of parcel-level hits among the rows that received coordinates.
    parcel_pct = 100 * gps_parcel / total_gps if total_gps else 0

    log.info(
        "%d: saved → %s  (%.1f Mo, %d rows)",
        annee, dest, size_mb, count_out,
    )
    log.info(
        "%d: GPS — parcel: %d (%.0f%%), commune fallback: %d, missing: %d",
        annee, gps_parcel, parcel_pct, gps_commune, gps_missing,
    )
    if count_out == 0:
        # A header-only output usually means the expected column names were
        # not found in the source file.
        log.warning(
            "%d: no row matched department %s out of %d rows read",
            annee, _DEPT, count_in,
        )
    return True


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """
    Command-line entry point.

    Parses --force / --annees, prints a banner, loads both GPS lookups once,
    then processes each requested year with telecharger_annee and prints a
    summary of successes and failures.
    """
    parser = argparse.ArgumentParser(
        description="Download raw DGFIP DVF files (2015-2020) filtered to Nord (59)"
    )
    parser.add_argument(
        "--force", action="store_true",
        help="Re-download even if the file already exists",
    )
    parser.add_argument(
        "--annees", nargs="+", type=int, default=_ANNEES,
        help=f"Years to download (default: {_ANNEES})",
    )
    args = parser.parse_args()

    # Deduplicate: a year given twice must not be processed (and, with
    # --force, downloaded) twice.
    annees = sorted(set(args.annees))

    print()
    print("=" * 60)
    print("  DVF historique Nord (59) — 2015-2020")
    print("=" * 60)
    print(f"  Annees     : {annees}")
    print(f"  Repertoire : {_DATA_DIR}")
    print("  Source     : data.cquest.org/dgfip_dvf (raw DGFIP)")
    print("  GPS lvl 1  : parcel centroids (cadastre, if cache present)")
    print("  GPS lvl 2  : commune centroids (API Geo, fallback)")
    print()

    # Both lookups are loaded once and shared by every year.
    parcel_centroids  = _load_parcel_centroids()
    commune_centroids = _fetch_commune_centroids()

    ok_count  = 0
    err_count = 0
    for annee in annees:
        if annee not in _SOURCES:
            log.error("%d: not in supported range %s", annee, list(_SOURCES))
            err_count += 1
            continue
        success = telecharger_annee(
            annee, args.force, parcel_centroids, commune_centroids
        )
        if success:
            ok_count += 1
        else:
            err_count += 1

    print()
    print("=" * 60)
    print(f"  Resultat : {ok_count} OK  |  {err_count} erreur(s)")
    if ok_count > 0:
        print()
        print("  Prochaines etapes :")
        print("    python build_mutations_iris.py  # rebuild mutations + prix_evolution_iris")
        print("    python build_pression_iris.py   # recalculate SPS/BPS/NPI")
        print("    python build_iris_data.py       # update iris_prix.json")
    print("=" * 60)


if __name__ == "__main__":
    main()
