stage/extract_dcox.py at main · Komodo-source/stage · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
Extrait un livret d'accueil (.docx ou données brutes) et l'envoie vers Airtable.

Usage:
    python extract_dcox.py <input_file.docx> [--dry-run]

Tables Airtable ciblées:
    - Maison
    - Proprietaire
    - Livret
    - Piscine
    - DispositionMaison
    - equipement
    - WorkFlowBreezeway  (check-in / check-out tasks)
    - ConditionLocation  (conditions de location)
"""

import argparse
import os
import zipfile
import xml.etree.ElementTree as ET
import re
import json
import sys
from selenium import webdriver
import json
import time
from bs4 import BeautifulSoup
import requests

def strip(text):
    """Return ``text`` with every ``<...>`` tag removed.

    Tags are matched non-greedily on a single line (``.`` does not cross
    newlines), and the text between tags is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)


def clean_extracted_data(data: dict) -> dict:
    """Normalise a Breezeway extraction dict in place and return it.

    - ``nom_maison``: keep only the text after the last " - " separator
      (drops prefixes such as "zAutre - M7 - ").
    - ``adresse``: trim, then drop trailing commas / semicolons / whitespace.
    - ``name_wifi`` / ``mdp_wifi``: trim surrounding whitespace.
    - ``recommandation``: give every entry a value for the optional fields,
      replacing missing or falsy ones with a neutral default.

    Keys that are absent or falsy are left alone.
    """
    house_name = data.get("nom_maison")
    if house_name:
        # The last segment of a " - " split is the whole string when there is
        # no separator at all.
        data["nom_maison"] = house_name.split(" - ")[-1]

    address = data.get("adresse")
    if address:
        data["adresse"] = re.sub(r'[,;\s]+$', '', address.strip())

    for wifi_key in ("name_wifi", "mdp_wifi"):
        if data.get(wifi_key):
            data[wifi_key] = data[wifi_key].strip()

    recommendations = data.get("recommandation")
    if recommendations:
        fallbacks = (
            ("rating", 0),
            ("photo", ""),
            ("summary", ""),
            ("latitude", 0),
            ("longitude", 0),
        )
        for reco in recommendations.values():
            for field, default in fallbacks:
                reco[field] = reco.get(field) or default
            # Fall back to the city when no formatted address is available.
            reco["formatted_address"] = (
                reco.get("formatted_address") or reco.get("city", "")
            )

    return data

def extract_content(html):
    """Split an HTML fragment into plain text plus its media sources.

    Every ``<img>`` / ``<iframe>`` that has a ``src`` is replaced in the text
    by a placeholder (`` $i<k> `` for images, `` $v<k> `` for iframes) where
    ``k`` is the index of that source in the returned list. Elements without
    a ``src`` are left in the tree.

    Returns a dict with ``text`` (newline-separated, stripped), ``images``
    and ``videos``.
    """
    soup = BeautifulSoup(html, "html.parser")

    def _swap_for_beacons(tag_name, prefix):
        # Collect the sources of one tag type, swapping each element for its
        # numbered placeholder.
        sources = []
        for node in soup.find_all(tag_name):
            link = node.get("src")
            if not link:
                continue
            node.replace_with(soup.new_string(f" ${prefix}{len(sources)} "))
            sources.append(link)
        return sources

    image_sources = _swap_for_beacons("img", "i")
    video_sources = _swap_for_beacons("iframe", "v")

    return {
        "text": soup.get_text(separator="\n").strip(),
        "images": image_sources,
        "videos": video_sources,
    }


def _fill_from_breezeway_guide(guide: dict, return_value: dict) -> None:
    """Copy the fields of one Breezeway guide payload into ``return_value``.

    The dict is filled in place and in a fixed order, so whatever was
    extracted before a missing key raises is still available to the caller.
    """
    home = guide["home"]
    return_value["nom_maison"] = home["name"]

    address = home["address"]
    return_value["adresse"] = (
        address["address1"] + ", " +
        address["city"] + ", " +
        (address.get("state") or "")
    )

    defaults = guide["company"]["defaults"]
    # Check-in and check-out times are stored together, ";"-separated.
    return_value["CICO"] = defaults["checkin_time"] + ";" + defaults["checkout_time"]

    photo = home["photo"]
    return_value["photo_maison"] = (
        "https://images.breezeway.io/" + photo["bucket"] + "/" + photo["photo_key"]
    )

    for page in guide["pages"]:
        title = page["title"]

        if title == "Bienvenue":
            for section in page["sections"]:
                if section["title"] == "Accès":
                    # NOTE(review): this passes the whole block data to
                    # strip(), while the rules section below passes
                    # data["content"] — confirm the payload shape.
                    return_value["html_bienvenue"] = strip(section["blocks"][0]["data"])
                elif section["title"] == "Wifi":
                    wifi = section["blocks"][0]["data"]
                    return_value["name_wifi"] = wifi["wifi_name"]
                    return_value["mdp_wifi"] = wifi["wifi_password"]
                elif section["title"] == "Règles de la maison":
                    return_value["rules"] = strip(section["blocks"][0]["data"]["content"])

        elif title == "Points d'Attention !":
            return_value["point_attention"] = {
                section["title"]: extract_content(section["blocks"][0]["data"])
                for section in page["sections"]
            }

        elif title == "Équipements intérieurs":
            return_value["equippement_intérieur"] = {
                section["title"]: extract_content(section["blocks"][0]["data"])
                for section in page["sections"]
            }

        elif title == "Équipements extérieurs":
            return_value["equippement_extérieur"] = {}

            for section in page["sections"]:
                titre = section["title"]
                # A dedicated local name: rebinding the payload variable here
                # used to clobber the guide for every later log entry.
                contenu = extract_content(section["blocks"][0]["data"])

                if "Piscine" in titre or "piscine" in titre:
                    # Pool sections are split into opening / closing
                    # instructions; both share the section's media.
                    pool_parsed = parse_pool_instructions(contenu["text"])

                    return_value["instruction_ouverture_piscine"] = {
                        "text": pool_parsed.get("ouverture", ""),
                        "images": contenu.get("images", []),
                        "videos": contenu.get("videos", [])
                    }

                    return_value["instruction_fermeture_piscine"] = {
                        "text": pool_parsed.get("fermeture", ""),
                        "images": contenu.get("images", []),
                        "videos": contenu.get("videos", [])
                    }
                else:
                    return_value["equippement_extérieur"][titre] = contenu

        elif title == "Recommandations":
            reco_list = page["sections"][0]["blocks"][0]["data"]
            return_value["recommandation"] = {
                item["name"]: item for item in reco_list
            }

        elif title == "Instructions de départ":
            v = extract_content(
                page["sections"][0]["blocks"][0]["data"]["content"]
            )
            # Lines made only of capitals / whitespace become <h2> headings,
            # then the text is flattened onto one line.
            v["text"] = re.sub(r'^[A-ZÉÈÀÙÂÊÎÔÛÇ\s]+$', lambda m: f"</p><h2 style=\"margin-top: 1rem\">{m.group().strip()}</h2><p>", v["text"], flags=re.MULTILINE)
            v["text"] = v["text"].replace("\n", " ")
            return_value["instruction_depart"] = v


def extract_data_breezeway(
    guide_url: str = "https://guide.breezeway.io/NpayeOvoM-Q/home/page/160614",
    wait_seconds: float = 10,
) -> dict:
    """Scrape a Breezeway guide by sniffing the API call its web page makes.

    Opens ``guide_url`` in Chrome with performance logging enabled, waits
    ``wait_seconds`` for the page's network requests, then looks in the
    captured log for responses from ``https://api.breezeway.io/public/guides``.
    Each distinct matching URL is downloaded again with ``requests`` and its
    JSON is mapped into the returned dict.

    Args:
        guide_url: Public guide page to open. Defaults to the guide that was
            previously hard-coded.
        wait_seconds: Time to wait after navigation before reading the logs.

    Returns:
        A dict with whatever could be extracted (house name, address,
        check-in/out times, wifi, rules, equipment, pool instructions,
        recommendations, departure instructions). Empty when no guide API
        response was seen or the payload could not be read.
    """
    return_value = {}
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--enable-logging')
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    driver = webdriver.Chrome(options=chrome_options)
    try:
        print("Navigating to the page...")
        driver.get(guide_url)

        # Wait for network requests to fire and resolve
        print(f"Waiting {wait_seconds} seconds for requests to complete...")
        time.sleep(wait_seconds)

        # Capture network log entries
        log_entries = driver.get_log("performance")
    finally:
        # The browser is only needed for the logs; always release it, even
        # when navigation fails.
        driver.quit()

    print(f"Total performance logs captured: {len(log_entries)}\n")
    print("-" * 50)

    found_target = False
    seen_urls = set()

    for entry in log_entries:
        # Best-effort per entry: one malformed log line or payload must not
        # abort the whole extraction.
        try:
            message = json.loads(entry.get("message", "{}")).get("message", {})
            if message.get("method", "") != 'Network.responseReceived':
                continue

            response_url = message.get('params', {}).get('response', {}).get('url', '')
            if "https://api.breezeway.io/public/guides" not in response_url:
                continue
            if response_url in seen_urls:
                # Same endpoint logged more than once: no need to refetch.
                continue
            seen_urls.add(response_url)
            found_target = True

            print("✅ Value found json ")
            print("retrieving values")
            print(response_url)
            guide = requests.get(response_url, timeout=30).json()
            _fill_from_breezeway_guide(guide, return_value)

        except Exception as e:
            print(f"Error parsing log: {e}")

    if not found_target:
        print("\nNo Breezeway guide API response was found in the network logs.")

    return return_value


# NOTE(review): `requests` is already imported unconditionally in the import
# block at the top of this file, so a missing package fails there before this
# guard can print its friendlier message.
try:
    import requests
except ImportError:
    print("Erreur : 'requests' non installé. Lancez : pip install requests")
    sys.exit(1)

# Airtable credentials: taken from the environment, with literal fallbacks.
# SECURITY(review): the fallback for AIRTABLE_TOKEN is a literal API token
# committed to source control. It should be revoked/rotated and the default
# removed so the token is only ever supplied through the environment.
TOKEN   = os.environ.get("AIRTABLE_TOKEN", "pateBMBdl8UARzfUe.966a1390153383e1bd6dd3a7452dab235d0f04f6fd56cf499201964ce837acce")
BASE_ID = os.environ.get("AIRTABLE_BASE_ID", "appLWhCKR6pEGE02s")

# Airtable table names.
T_MAISON       = "Maison"
T_LISTEMAISON       = "ListeMaison"
T_PROPRIETAIRE = "Proprietaire"
T_LIVRET       = "Livret"
T_PISCINE      = "Piscine"
T_DISPOSITION  = "DispositionMaison"
T_EQUIPEMENT   = "equipement"
T_RECOMMANDATION   = "Recommandation"
T_WORKFLOW     = "WorkFlowBreezeway"
T_CONDITION    = "ConditionLocation"

# WordprocessingML namespace map for ElementTree lookups in document.xml.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# Ordered (lower-case substring, section id) pairs used by detect_section().
# Order matters: the first pair whose substring occurs in the text wins, so
# "accès à la maison" must stay ahead of the bare "maison" and "accès".
SECTION_KEYWORDS = [
    ("accès à la maison",          "ACCES_MAISON"),
    ("fonctionnement de la maison","FONCTIONNEMENT"),
    ("fonctionnement",             "FONCTIONNEMENT"),
    ("gestion des locs",           "GESTION"),
    ("équipements bébés",          "BEBES"),
    ("known issues",               "ISSUES"),
    ("propriétaires",              "PROPRIETAIRES"),
    ("process ménage",             "PROCESS"),
    ("récap",                      "RECAP"),
    ("maison",                     "MAISON"),
    ("accès",                      "ACCES"),
]

def detect_section(text: str) -> str | None:
    """Return the section id announced by a heading-like text, if any.

    The text is trimmed, lower-cased and stripped of surrounding ``*``
    characters, then compared against ``SECTION_KEYWORDS`` in order; the first
    keyword contained in the text decides. Returns ``None`` when no keyword
    matches.
    """
    normalized = text.strip().lower().strip("*").strip()
    return next(
        (section for keyword, section in SECTION_KEYWORDS if keyword in normalized),
        None,
    )

def _parse_xml_bytes(xml_bytes):
    """Parse raw ``word/document.xml`` bytes into an ordered content list.

    Walks the direct children of ``<w:body>`` and returns, in document order:
    - ``("paragraph", text)`` for every non-blank paragraph (text stripped);
    - ``("table", rows)`` for every table, ``rows`` being a list of lists of
      cell texts (cells are not stripped here).

    Returns ``[]`` when the document has no body element.
    """
    body = ET.fromstring(xml_bytes).find("w:body", NS)
    if body is None:
        return []

    w_uri = NS["w"]
    paragraph_tag = f"{{{w_uri}}}p"
    table_tag = f"{{{w_uri}}}tbl"

    def _joined_text(node):
        # Concatenate every <w:t> run found anywhere below ``node``.
        return "".join(t.text or "" for t in node.findall(".//w:t", NS))

    items = []
    for element in body:
        if element.tag == paragraph_tag:
            paragraph = _joined_text(element).strip()
            if paragraph:
                items.append(("paragraph", paragraph))
        elif element.tag == table_tag:
            grid = [
                [_joined_text(cell) for cell in row.findall("w:tc", NS)]
                for row in element.findall("w:tr", NS)
            ]
            items.append(("table", grid))
    return items


def extract_contents(path: str):
    """Return the document content as an ordered list of items.

    Each item is ``('paragraph', str)`` or ``('table', list[list[str]])``, in
    document order. python-docx is used when it can be imported; otherwise the
    .docx archive is opened directly and ``word/document.xml`` is parsed by
    ``_parse_xml_bytes``.
    """
    try:
        from docx import Document
        from docx.oxml.ns import qn

        paragraph_tag, table_tag = qn("w:p"), qn("w:tbl")
        text_path = f".//{qn('w:t')}"
        row_path = f".//{qn('w:tr')}"
        cell_path = f"{qn('w:tc')}"

        def _text_of(node):
            # Concatenate every text run found anywhere below ``node``.
            return "".join(t.text or "" for t in node.findall(text_path))

        items = []
        # Iterate the body's children directly so paragraphs and tables come
        # out interleaved in document order.
        for node in Document(path).element.body:
            if node.tag == paragraph_tag:
                paragraph = _text_of(node).strip()
                if paragraph:
                    items.append(("paragraph", paragraph))
            elif node.tag == table_tag:
                grid = [
                    [_text_of(cell).strip() for cell in row.findall(cell_path)]
                    for row in node.findall(row_path)
                ]
                items.append(("table", grid))
        return items
    except ImportError:
        pass

    # Fallback: raw XML
    with zipfile.ZipFile(path, "r") as archive:
        return _parse_xml_bytes(archive.read("word/document.xml"))


def first_email(text: str) -> str:
    """Return the first e-mail address found in ``text``, or ``""`` if none.

    The domain is matched label by label (``name(.label)+``) so punctuation
    that merely follows the address, such as the full stop ending a sentence,
    is not captured as part of it.
    """
    m = re.search(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+", text)
    return m.group(0) if m else ""

def first_phone(text: str) -> str:
    """Return the first French phone number found in ``text``, or ``""``.

    Accepts the national form (``0X`` followed by four pairs of digits) and
    the international form (``+33 X ...``). Digit pairs may be separated by a
    space, a dot or a hyphen, and the same optional separator is allowed
    right after ``+33`` so that "+33 6 12 34 56 78" is recognised.
    """
    m = re.search(r"(?:\+33[\s.\-]?|0)[1-9](?:[\s.\-]?\d{2}){4}", text)
    return m.group(0) if m else ""

def first_url(text: str) -> str:
    """Return the first http(s) URL in ``text`` (up to the next whitespace), or ``""``."""
    found = re.search(r"https?://\S+", text)
    if found is None:
        return ""
    return found.group(0)

def table_to_dict(table_rows) -> dict:
    """Turn a two-column (key | value) table into a dict.

    Keys are the stripped, lower-cased first cells; values are the stripped
    second cells. Rows with fewer than two cells or a blank key are skipped,
    and a repeated key keeps its last value.
    """
    return {
        row[0].strip().lower(): row[1].strip()
        for row in table_rows
        if len(row) >= 2 and row[0].strip()
    }

def kv_get(d: dict, *keywords) -> str:
    """Return the value of the first key that contains one of ``keywords``.

    Keywords are tried in the order given; for each one the dict is scanned
    in insertion order. Returns ``""`` when nothing matches.
    """
    matches = (
        value
        for keyword in keywords
        for key, value in d.items()
        if keyword in key
    )
    return next(matches, "")


def extract_youtube_urls(videos: list) -> list:
    """Keep only the string entries of ``videos`` that mention "youtube".

    The match is case-insensitive; non-string entries are ignored. An empty or
    ``None`` input yields ``[]``.
    """
    if not videos:
        return []

    kept = []
    for candidate in videos:
        if isinstance(candidate, str) and "youtube" in candidate.lower():
            kept.append(candidate)
    return kept


def getDataExcel(id_m):
    """Look up the people in charge of a house in ``data/responsable.xlsx``.

    Reads columns 0, 2 and 3 of the sheet, selects the rows whose ``id``
    equals ``id_m`` and uses the first of them.

    Returns a tuple ``(equipe, responsable)`` holding the first
    space-separated word of the "Équipe centrale" and "Responsable" cells.
    Raises ``IndexError`` when no row has that id.
    """
    import pandas as pd

    sheet = pd.read_excel("data/responsable.xlsx", usecols=[0,2,3])
    matching = sheet.loc[sheet['id'] == id_m]

    responsable_words = matching["Responsable"].iloc[0].split(" ")
    equipe_words = matching["Équipe centrale"].iloc[0].split(" ")
    return (equipe_words[0], responsable_words[0])


def map_equipement_category(raw_cat: str) -> list:
    """Map a free-text category label onto a one-element category list.

    The label is lower-cased and checked against keyword groups in a fixed
    order; the first group with a keyword contained in the label wins.
    Falls back to ``["autre"]``.
    """
    lowered = raw_cat.lower()
    # Order matters: earlier groups take precedence over later ones.
    rules = (
        ("extérieur", ("extérieur", "exterieur", "jardin", "piscine", "plancha", "barbecue")),
        ("intérieur", ("intérieur", "interieur", "cuisine", "billard", "salon")),
        ("alarme", ("alarme",)),
        ("wifi", ("wifi", "internet", "adsl", "fibre")),
        ("fonctionnement", ("fonctionnement", "chauffage", "électricité", "electricite", "eau", "gaz")),
    )
    for category, needles in rules:
        for needle in needles:
            if needle in lowered:
                return [category]
    return ["autre"]


def _clean(s: str) -> str:
    """Nettoie les espaces excessifs."""
    return re.sub(r"\s+", " ", s).strip()


def parse_and_map_data(contents: list) -> dict:
    """Map an ordered list of document items onto a flat data dict.

    ``contents`` is a list of ``(item_type, item)`` pairs where ``item_type``
    is ``"paragraph"`` (``item`` is a string) or ``"table"`` (``item`` is a
    list of rows, each a list of cell strings) — presumably the output of
    ``extract_contents``.

    Paragraphs drive a small state machine: whenever ``detect_section``
    recognises one, it becomes the current section. Each table is then read
    according to the section it appears in. Scalar fields are filled on a
    first-value-wins basis (``data[x] = data[x] or ...``).

    Returns the dict initialised below, with string fields defaulting to ""
    and list fields ("Bebes Raw", "Pieces", "Issues", "Equipements") to [].
    """
    data = {
        "Nom Maison":                 "",
        "Adresse":                    "",
        "Lien Maps":                  "",
        "Instruction Acces Externe":  "",   # instructions for guests
        "Instruction Acces Interne":  "",   # internal notes (badge, codes…)
        "Portail":                    "",
        "Porte Entree":               "",
        "Boite Cle":                  "",
        "Jeux de Cle":                "",
        "Alarme Interne":             "",
        "Point Attention":            "",
        "Proprietaire Nom":           "",
        "Proprietaire Prenom":        "",
        "Proprietaire Email":         "",
        "Proprietaire Tel":           "",
        "Proprietaire Info":          "",
        # ── Management ───────────────────────────────────────
        "Whatsapp":                   "",
        "Titulaire Annonce":          "",
        "Proprietaire Co Hote":       "",
        "Grille Prix":                "",
        "Periodes Bloquer":           "",
        # ── Process ──────────────────────────────────────────
        "Check In":                   "",
        "Check Out":                  "",
        "Delai Menage":               "",
        # ── How the house works ──────────────────────────────
        "Chauffage":                  "",
        "Electricite":                "",
        "Internet":                   "",
        "Wifi SSID":                  "",
        "Wifi Mdp":                   "",
        "Eau Chaude":                 "",
        "Poubelle":                   "",
        "Cheminee":                   "",
        "Jardin":                     "",
        "Linge":                      "",
        "Gaz Cuisine":                "",
        "Gaz Plancha":                "",
        "Frigo":                      "",
        "Equipements Cuisine":        "",
        "Equipements Interieurs":     "",
        "Equipements Exterieurs":     "",
        "Fenetre Volet":              "",
        "Espaces Prives":             "",
        # ── Pool ─────────────────────────────────────────────
        "Piscine Raw":                "",
        # ── Babies ───────────────────────────────────────────
        "Bebes Raw":                  [],   # raw table rows
        # ── Nordic bath ──────────────────────────────────────
        "Bain Nordique":              "",
        # ── Lists ────────────────────────────────────────────
        "Pieces":      [],   # {Piece, Description, Etage, Type}
        "Issues":      [],   # {Categorie, Probleme, Statut}
        "Equipements": [],   # {Nom, Description, Categorie}
    }

    # Section id of the most recent heading paragraph; None until one is seen.
    current_section = None

    # Paragraphs collected while in the ACCES section (guest access text).
    _acces_ext_lines = []

    for item_type, item in contents:

        # ── Section detection ────────────────────────────────
        if item_type == "paragraph":
            sec = detect_section(item)
            if sec:
                current_section = sec

            # House name (e.g. "M7 - La Bergerie du Vexin"): keep what follows
            # the first hyphen / en dash.
            if not data["Nom Maison"] and re.match(r"M\d+\s*[-–]", item):
                data["Nom Maison"] = _clean(re.split(r"[-–]", item, maxsplit=1)[-1])

            # Google Maps link: the URL if one is present, else the raw text.
            if "maps" in item.lower() and not data["Lien Maps"]:
                data["Lien Maps"] = first_url(item) or item.strip()

            # External access paragraphs; headings and map links are left out.
            if current_section == "ACCES":
                if not any(
                    kw in item.lower()
                    for kw in ("récap", "recap", "accès à la maison", "fonctionnement", "maps.app")
                ):
                    _acces_ext_lines.append(item)

        # ── Tables ───────────────────────────────────────────
        elif item_type == "table":
            # Key/value view of the table (lower-cased keys) for kv_get lookups.
            kv = table_to_dict(item)

            # ── ACCES (address / alarm) ───────────────────────
            if current_section in ("ACCES", "RECAP"):
                data["Adresse"]        = data["Adresse"]        or kv_get(kv, "adresse")
                data["Alarme Interne"] = data["Alarme Interne"] or kv_get(kv, "alarme")

            # ── PROCESS ──────────────────────────────────────
            if current_section == "PROCESS":
                data["Check In"]      = data["Check In"]     or kv_get(kv, "check in", "checkin")
                data["Check Out"]     = data["Check Out"]    or kv_get(kv, "checkout", "check out")
                data["Delai Menage"]  = data["Delai Menage"] or kv_get(kv, "délai", "menage", "ménage")

            # ── GESTION (management) ─────────────────────────
            if current_section == "GESTION":
                data["Whatsapp"]          = data["Whatsapp"]          or kv_get(kv, "whatsapp")
                data["Titulaire Annonce"] = data["Titulaire Annonce"] or kv_get(kv, "titulaire")
                data["Proprietaire Co Hote"] = data["Proprietaire Co Hote"] or kv_get(kv, "co-hôte", "co hote", "cohote")
                data["Grille Prix"]       = data["Grille Prix"]       or kv_get(kv, "grille")
                data["Periodes Bloquer"]  = data["Periodes Bloquer"]  or kv_get(kv, "période", "periode", "bloquer")

            # ── PROPRIETAIRES (owners) ───────────────────────
            if current_section == "PROPRIETAIRES":
                raw_nom   = kv_get(kv, "nom")
                raw_tel   = kv_get(kv, "téléphone", "telephone", "tel")
                raw_email = kv_get(kv, "email")
                raw_info  = kv_get(kv, "enregistrement", "rib", "numéro")
                if raw_nom and not data["Proprietaire Nom"]:
                    # Text after the last ":" is split once on a space: first
                    # word → first name, remainder → last name.
                    full = raw_nom.split(":")[-1].strip()
                    parts = full.split(" ", 1)
                    data["Proprietaire Prenom"] = parts[0] if parts else ""
                    data["Proprietaire Nom"]    = parts[1] if len(parts) > 1 else full
                if raw_tel and not data["Proprietaire Tel"]:
                    # Prefer a recognised phone number; otherwise keep the
                    # text after the last ":".
                    data["Proprietaire Tel"] = first_phone(raw_tel) or raw_tel.split(":")[-1].strip()
                if raw_email and not data["Proprietaire Email"]:
                    # With an "old -> new" notation, only the part after the
                    # last "->" is searched.
                    src = raw_email.split("->")[-1] if "->" in raw_email else raw_email
                    data["Proprietaire Email"] = first_email(src)
                if raw_info and not data["Proprietaire Info"]:
                    data["Proprietaire Info"] = raw_info

            # ── MAISON (rooms) ────────────────────────────────
            if current_section == "MAISON":
                # Floor label applied to following rooms; reset for each table.
                current_etage = ""
                for row in item:
                    if not row:
                        continue
                    label = row[0].strip()
                    label_low = label.lower()
                    # Floor sub-heading: a row with only its first cell filled
                    if len(row) == 1 or (len(row) >= 2 and not row[1].strip()):
                        if any(k in label_low for k in ("rdc", "étage", "r+", "rez", "annexe", "principale", "secondaire")):
                            current_etage = label
                            continue
                    # Room with a description
                    if len(row) >= 2:
                        PIECE_KEYS = ("chambre", "sdb", "salle de bain", "salon", "wc",
                                      "cuisine", "mezzanine", "déboté", "debot",
                                      "biblioth", "couloir", "bureau")
                        if any(k in label_low for k in PIECE_KEYS):
                            # Room type from the label; first matching test wins.
                            if "chambre" in label_low:
                                ptype = "Chambre"
                            elif any(k in label_low for k in ("sdb", "salle de bain")):
                                ptype = "Salle de bain"
                            elif "salon" in label_low:
                                ptype = "Salon"
                            elif "cuisine" in label_low:
                                ptype = "Cuisine"
                            elif "wc" in label_low:
                                ptype = "WC"
                            else:
                                ptype = "Autre"
                            data["Pieces"].append({
                                "Piece":       label,
                                "Description": row[1].strip(),
                                "Etage":       current_etage,
                                "Type":        ptype,
                            })

            # ── BEBES (baby equipment) ────────────────────────
            if current_section == "BEBES":
                for row in item:
                    # Keep whole rows whose first cell mentions baby gear.
                    if any(k in (row[0].lower() if row else "") for k in ("bébé", "bebe", "lit", "chaise")):
                        data["Bebes Raw"].append(row)

            # ── ACCES_MAISON (getting into the house) ─────────
            if current_section == "ACCES_MAISON":
                data["Portail"]        = data["Portail"]        or kv_get(kv, "portail")
                data["Jeux de Cle"]    = data["Jeux de Cle"]    or kv_get(kv, "clés", "cles")
                data["Alarme Interne"] = data["Alarme Interne"] or kv_get(kv, "alarme")
                data["Boite Cle"]      = data["Boite Cle"]      or kv_get(kv, "boîte à clés", "boite a cles", "boîte")

            # ── FONCTIONNEMENT (how the house works) ──────────
            if current_section == "FONCTIONNEMENT":
                data["Chauffage"]             = data["Chauffage"]             or kv_get(kv, "chauffage")
                data["Electricite"]           = data["Electricite"]           or kv_get(kv, "électricité", "electricite", "tableau")
                data["Piscine Raw"]           = data["Piscine Raw"]           or kv_get(kv, "piscine")
                data["Poubelle"]              = data["Poubelle"]              or kv_get(kv, "poubelle")
                data["Cheminee"]              = data["Cheminee"]              or kv_get(kv, "cheminée", "cheminee", "poêle")
                data["Jardin"]                = data["Jardin"]                or kv_get(kv, "jardin")
                data["Linge"]                 = data["Linge"]                 or kv_get(kv, "linge")
                data["Gaz Cuisine"]           = data["Gaz Cuisine"]           or kv_get(kv, "gaz - cuisine", "gaz cuisine", "gaz\xa0cuisine")
                data["Gaz Plancha"]           = data["Gaz Plancha"]           or kv_get(kv, "gaz - plancha", "gaz plancha", "gaz\xa0plancha")
                data["Internet"]              = data["Internet"]              or kv_get(kv, "internet", "wifi", "adsl", "fibre")
                data["Frigo"]                 = data["Frigo"]                 or kv_get(kv, "frigo", "congélateur", "congelateur")
                data["Equipements Cuisine"]   = data["Equipements Cuisine"]   or kv_get(kv, "équipements de cuisine", "equipements cuisine", "équipements cuisine")
                data["Equipements Interieurs"]= data["Equipements Interieurs"]or kv_get(kv, "équipements intérieurs", "equipements interieurs")
                data["Equipements Exterieurs"]= data["Equipements Exterieurs"]or kv_get(kv, "équipements extérieurs", "equipements exterieurs")
                data["Fenetre Volet"]         = data["Fenetre Volet"]         or kv_get(kv, "fenêtre", "fenetre", "volet")
                data["Espaces Prives"]        = data["Espaces Prives"]        or kv_get(kv, "espaces privés", "espaces prives", "espace privé")
                data["Bain Nordique"]         = data["Bain Nordique"]         or kv_get(kv, "bain nordique", "jacuzzi", "spa", "bain")
                data["Eau Chaude"]            = data["Eau Chaude"]            or kv_get(kv, "eau chaude")

                # WiFi SSID / password pulled out of the Internet field
                inet = data["Internet"]
                if inet:
                    m_ssid = re.search(r"(?:ssid|box|livebox)[^\n:]*[:\s]+([^\n]+)", inet, re.I)
                    m_mdp  = re.search(r"(?:mdp|mot de passe|password|clé wifi)[^\n:]*[:\s]+([^\n]+)", inet, re.I)
                    if m_ssid and not data["Wifi SSID"]:
                        data["Wifi SSID"] = m_ssid.group(1).strip()
                    if m_mdp and not data["Wifi Mdp"]:
                        data["Wifi Mdp"] = m_mdp.group(1).strip()

                # Equipment entries derived from the fields filled above:
                # (display name, source key in data, category)
                for cat_label, cat_key, cat_mapped in [
                    ("Équipements cuisine",    "Equipements Cuisine",    "intérieur"),
                    ("Équipements intérieurs", "Equipements Interieurs", "intérieur"),
                    ("Équipements extérieurs", "Equipements Exterieurs", "extérieur"),
                    ("Chauffage",              "Chauffage",              "fonctionnement"),
                    ("Internet / Wifi",        "Internet",               "wifi"),
                    ("Piscine",                "Piscine Raw",            "extérieur"),
                    ("Bain nordique",          "Bain Nordique",          "extérieur"),
                    ("Gaz cuisine",            "Gaz Cuisine",            "fonctionnement"),
                    ("Gaz plancha",            "Gaz Plancha",            "fonctionnement"),
                    ("Cheminée",               "Cheminee",               "intérieur"),
                    ("Jardin",                 "Jardin",                 "extérieur"),
                    ("Frigo / Congélateur",    "Frigo",                  "intérieur"),
                    ("Poubelle",               "Poubelle",               "fonctionnement"),
                ]:
                    val = data.get(cat_key, "")
                    if val:
                        # Avoid duplicates: this loop runs once per table in
                        # the section.
                        if not any(e["Nom"] == cat_label for e in data["Equipements"]):
                            data["Equipements"].append({
                                "Nom":        cat_label,
                                "Description": val,
                                "Categorie":  cat_mapped,
                            })

            # ── ISSUES ────────────────────────────────────────
            if current_section == "ISSUES":
                for row in item:
                    # Needs category / problem / status columns.
                    if len(row) < 3:
                        continue
                    # Skip the header row and rows without a category.
                    if row[0].strip().lower() in ("catégorie", "categorie", ""):
                        continue
                    data["Issues"].append({
                        "Categorie": row[0].strip(),
                        "Probleme":  row[1].strip(),
                        "Statut":    row[2].strip(),
                    })

    data["Instruction Acces Externe"] = "\n".join(_acces_ext_lines).strip()

    # Internal access instructions = gate + key box + alarm, each under its
    # own heading, separated by blank lines.
    _internal = []
    if data["Portail"]:    _internal.append("PORTAIL\n" + data["Portail"])
    if data["Boite Cle"]:  _internal.append("BOÎTE À CLÉS\n" + data["Boite Cle"])
    if data["Alarme Interne"]: _internal.append("ALARME\n" + data["Alarme Interne"])
    data["Instruction Acces Interne"] = "\n\n".join(_internal)

    return data


# HTTP headers shared by the Airtable request helpers below: JSON payloads,
# authenticated with the bearer token read at import time.
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {TOKEN}",
}

def urls_to_attachments(urls: list) -> list:
    """Wrap every usable URL in a ``{"url": ...}`` mapping.

    Entries that are falsy or that are not ``str`` instances are dropped;
    the relative order of the remaining URLs is preserved.
    """
    usable = (item for item in urls if item and isinstance(item, str))
    return [{"url": item} for item in usable]


def _url(table: str) -> str:
    """Return the Airtable REST endpoint URL for ``table`` in base ``BASE_ID``.

    The table name is percent-encoded as a single path segment, so a name
    containing ``/`` cannot split the URL path.
    """
    # quote() leaves "/" unescaped by default; safe='' encodes it as %2F too.
    return f"https://api.airtable.com/v0/{BASE_ID}/{requests.utils.quote(table, safe='')}"


def airtable_create(table: str, fields: dict) -> str | None:
    """Create one record in ``table`` and return its record ID.

    Args:
        table: Table name, resolved to an endpoint by ``_url``.
        fields: Field values sent as the ``"fields"`` payload.

    Returns:
        The ``"id"`` from the response body, or ``None`` when the request
        fails (network error, timeout, or a status other than 200/201).
        Every outcome is reported on stdout.
    """
    try:
        # A timeout keeps a stalled connection from blocking the import forever.
        resp = requests.post(
            _url(table),
            headers=HEADERS,
            json={"fields": fields},
            timeout=30,
        )
    except requests.RequestException as exc:
        # Same failure contract as an HTTP error: report it and return None.
        print(f"   ❌ [{table}] erreur réseau : {exc}")
        return None
    if resp.status_code in (200, 201):
        rid = resp.json().get("id")
        print(f"   ✅ [{table}] créé → {rid}")
        return rid
    print(f"   ❌ [{table}] erreur {resp.status_code} : {resp.text}")
    return None


def find_equivalent_description_interne_equipement(objet, all_equip):
    """Look up the first equipment entry that mentions ``objet``.

    An entry matches when ``objet`` is a substring of the stripped part of
    its ``"Nom"`` before the first "—", or a substring of its
    ``"Description"``. Text after the "—" is not searched.

    Returns:
        ``(description, detail)`` for the first match, where ``detail`` is
        the stripped text after the first "—" in ``"Nom"`` when there is
        one, otherwise the entry's ``"Checklist"`` value (``""`` if absent).
        ``("", "")`` when no entry matches.
    """
    for equip in all_equip:
        label, dash, remainder = equip["Nom"].partition("—")

        if objet not in label.strip() and objet not in equip["Description"]:
            continue

        detail = remainder.strip() if dash else equip.get("Checklist", "")
        return (equip["Description"], detail)

    return ("", "")

def airtable_create_batch(table: str, records: list) -> list:
    """Create ``records`` in ``table``, sending them in chunks of 10.

    Args:
        table: Table name, resolved to an endpoint by ``_url``.
        records: One dict of field values per record to create.

    Returns:
        The IDs reported by the successful requests, in request order. A
        chunk that fails (network error, timeout, or a status other than
        200/201) is reported on stdout and contributes no IDs; the remaining
        chunks are still attempted.
    """
    url = _url(table)
    ids = []
    # 10 records per request is the limit of the Airtable create endpoint.
    for batch_no, start in enumerate(range(0, len(records), 10), start=1):
        batch = [{"fields": r} for r in records[start:start + 10]]
        try:
            # A timeout keeps a stalled connection from blocking the import forever.
            resp = requests.post(url, headers=HEADERS, json={"records": batch}, timeout=30)
        except requests.RequestException as exc:
            # Treat a network failure like an HTTP error: report it, go on.
            print(f"   [{table}] batch {batch_no} erreur réseau : {exc}")
            continue
        if resp.status_code in (200, 201):
            batch_ids = [r["id"] for r in resp.json().get("records", [])]
            ids.extend(batch_ids)
            print(f"  [{table}] batch {batch_no} → {len(batch_ids)} enregistrement(s)")
        else:
            print(f"   [{table}] batch {batch_no} erreur {resp.status_code} : {resp.text}")
    return ids


def airtable_patch(table: str, record_id: str, fields: dict):
    """Update the given ``fields`` of record ``record_id`` in ``table``.

    The outcome is only reported on stdout; nothing is returned. A network
    error or timeout is reported the same way as an HTTP error instead of
    propagating.
    """
    url = f"{_url(table)}/{record_id}"
    try:
        # A timeout keeps a stalled connection from blocking the import forever.
        resp = requests.patch(url, headers=HEADERS, json={"fields": fields}, timeout=30)
    except requests.RequestException as exc:
        print(f"   [{table}] patch {record_id} erreur réseau : {exc}")
        return
    if resp.status_code == 200:
        print(f"  [{table}] {record_id} mis à jour")
    else:
        print(f"   [{table}] patch {record_id} erreur {resp.status_code} : {resp.text}")


def airtable_find_by_field(table: str, field: str, value: str) -> str | None:
    """Find a record whose ``field`` equals ``value`` and return its ID.

    Args:
        table: Table name, resolved to an endpoint by ``_url``.
        field: Field name, wrapped in ``{...}`` inside the filter formula.
        value: Value to match; embedded as a single-quoted string literal.

    Returns:
        The ID of the first record returned, or ``None`` when the response
        status is not 200 or no record matches.

    Network errors and timeouts are deliberately left to propagate:
    reporting "not found" on a failed lookup would let callers create a
    duplicate record.
    """
    # Escape backslashes and single quotes so the value remains one string
    # literal in the formula (e.g. "O'Brien") instead of breaking or altering it.
    literal = str(value).replace("\\", "\\\\").replace("'", "\\'")
    params = {"filterByFormula": f"{{{field}}}='{literal}'"}
    # A timeout keeps a stalled connection from blocking the import forever.
    resp = requests.get(_url(table), headers=HEADERS, params=params, timeout=30)
    if resp.status_code == 200:
        records = resp.json().get("records", [])
        if records:
            return records[0]["id"]
    return None


def find_or_create_proprietaire(data: dict) -> str | None:
    """Return the owner record ID, reusing an existing record when possible.

    When ``data["Proprietaire Email"]`` is non-empty, the owner table is
    searched on its ``Email`` field and a found ID is returned as is.
    Otherwise a new owner record is created from the ``Proprietaire *`` and
    ``Whatsapp`` entries of ``data``; the result of that creation (possibly
    ``None``) is returned.
    """
    owner_email = data["Proprietaire Email"]
    existing_id = (
        airtable_find_by_field(T_PROPRIETAIRE, "Email", owner_email)
        if owner_email
        else None
    )
    if existing_id:
        print(f"   ℹ️  [Proprietaire] déjà existant → {existing_id}")
        return existing_id

    # No reusable record: build the payload for a brand-new owner.
    new_owner = {
        "Nom": data["Proprietaire Nom"],
        "Prenom": data["Proprietaire Prenom"],
        "Email": data["Proprietaire Email"],
        "Telephone": data["Proprietaire Tel"],
        "Groupe whatsapp": data["Whatsapp"],
        "InfoComplémentaire": data["Proprietaire Info"],
    }
    return airtable_create(T_PROPRIETAIRE, new_owner)


def replace_beacons_in_text(text: str, images: list, videos: list) -> str:
    """Replace ``$iN`` / ``$vN`` placeholders with the matching URLs.

    ``$iN`` is replaced by ``images[N]`` and ``$vN`` by ``videos[N]``, each
    surrounded by newlines. Video URLs are inserted as plain URLs whatever
    their host. Placeholders whose index has no entry in the corresponding
    list are left untouched.

    Args:
        text: Text containing placeholders; a falsy value yields ``""``.
        images: URLs addressed by ``$i0``, ``$i1``, ...
        videos: URLs addressed by ``$v0``, ``$v1``, ...

    Returns:
        The text with placeholders expanded, runs of three or more newlines
        collapsed to two, and surrounding whitespace stripped.
    """
    if not text:
        return ""

    # Keyed by the exact decimal index so "$i1" can never match inside "$i10".
    urls_by_kind = {
        "i": {str(idx): url for idx, url in enumerate(images)},
        "v": {str(idx): url for idx, url in enumerate(videos)},
    }

    def _expand(match):
        known = urls_by_kind[match.group(1)]
        index = match.group(2)
        if index not in known:
            return match.group(0)
        return f"\n{known[index]}\n"

    # Single pass over whole "$<kind><digits>" tokens; a callable replacement
    # also keeps backslashes in URLs from being read as regex escapes.
    result = re.sub(r'\$([iv])([0-9]+)', _expand, text)

    result = re.sub(r'\n{3,}', '\n\n', result)
    return result.strip()


def parse_pool_section(text: str, keyword: str) -> str:
    """Extract the text that follows ``keyword`` in ``text``.

    The match is case-insensitive. After the keyword, an optional ``:`` or
    ``-`` and surrounding whitespace are skipped; the section then runs up
    to the first blank line, the next "ouverture"/"fermeture" word, or the
    end of the text.

    Args:
        text: Text to search.
        keyword: Literal section name (e.g. "fermeture", "ouverture").

    Returns:
        The stripped section text, or ``""`` when ``keyword`` is not found.
    """
    # keyword is matched literally: escaping keeps characters such as "." or
    # "(" from being interpreted as regex syntax. IGNORECASE already covers
    # capitalized spellings of the keyword and of the stop words.
    pattern = rf'{re.escape(keyword)}\s*[:\-]?\s*(.+?)(?:\n\n|ouverture|fermeture|$)'
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else ""


def format_youtube_field(videos: list) -> str | list:
    """
    Format YouTube videos for Airtable.
    Returns first URL as string (for text/URL fields).
    Modify to return list if your field supports multiple values.
    """
    yt_urls = [v for v in videos if "youtube" in v.lower()]
    return yt_urls[0] if yt_urls else None

def parse_pool_instructions(raw_text: str) -> dict:
    """Split free-form pool instructions into named sections.

    A section starts at a line beginning with one of the markers ``NO.<n>``,
    ``Ouverture``, ``Fermeture``, ``Entretien``, ``Opening`` or ``Closing``
    (case-insensitive, optionally followed by ``:`` or ``-``). Text before
    the first marker counts as "ouverture". Markers containing "ferm",
    "clos" or "no.2" feed "fermeture", markers containing "entret" feed
    "entretien", and every other marker feeds "ouverture". Several chunks
    for the same section are joined with a newline.

    Args:
        raw_text: The raw instructions; may be empty or ``None``.

    Returns:
        For falsy input, a dict with the three keys "ouverture",
        "fermeture" and "entretien", all mapped to ``""``. Otherwise a dict
        holding only the sections that received text.
    """
    sections = {"ouverture": "", "fermeture": "", "entretien": ""}

    if not raw_text:
        return sections

    # The capturing group makes re.split return
    # [leading_text, marker_1, text_1, marker_2, text_2, ...].
    tokens = re.split(
        r'\n\s*(NO\.\d+|Ouverture|Fermeture|Entretien|Opening|Closing)\s*[:\-]?\s*',
        raw_text,
        flags=re.IGNORECASE,
    )
    chunks = [("", tokens[0])] + list(zip(tokens[1::2], tokens[2::2]))

    for marker, body in chunks:
        body = body.strip()
        if not body:
            continue

        marker = marker.lower()
        # "clos" (not "close") so that the "Closing" marker is recognised.
        if any(key in marker for key in ("ferm", "clos", "no.2")):
            target = "fermeture"
        elif "entret" in marker:
            target = "entretien"
        else:
            target = "ouverture"

        sections[target] = f"{sections[target]}\n{body}" if sections[target] else body

    return {name: content for name, content in sections.items() if content}

def send_to_airtable(data: dict, id_maison: str, dry_run: bool = False):
    print("\n📦 Préparation de l'envoi vers Airtable…\n")
    data_breezeway = clean_extracted_data(extract_data_breezeway())
    if dry_run:
        print("   Mode dry-run : aucune donnée ne sera envoyée.\n")
        return

    print("1️⃣  Propriétaire")
    prop_id = find_or_create_proprietaire(data)


    point_attention = ""
    list_media = []  # For Photos attachment field
    list_youtube = []  # For separate YouTube tracking if needed

    for title, content in data_breezeway.get("point_attention", {}).items():
        point_attention += f"<h1>{title}</h1>\n{content.get('text', '')}\n"

        images = content.get("images", []) or []
        videos = content.get("videos", []) or []

        list_media.extend([img for img in images if isinstance(img, str)])

        # ✅ Collect YouTube URLs separately
        youtube_urls = [v for v in videos if isinstance(v, str) and "youtube" in v.lower()]
        list_youtube.extend(youtube_urls)


    print("\nMaison")
    maison_fields = {
        "IdMaison":                  id_maison,
        "Adresse":                   data_breezeway["adresse"],
        "Description":               data_breezeway["html_bienvenue"],
        "AlarmeInterne":             data["Alarme Interne"],
        "InstructionAccesInterne":   data["Instruction Acces Interne"],
        "InstructionAccesExterne":   data["Instruction Acces Externe"],
        "InstructionDepart":      data_breezeway["instruction_depart"]["text"],
        "ResponsableCentrale": getDataExcel(id_maison)[0],

        "Proprietaire": [prop_id],
        "ResponsableZone" : getDataExcel(id_maison)[1],
        "Portail":                   data["Portail"],
        "BoiteCle":                  data["Boite Cle"],
        "JeuxDeCle":               data["Jeux de Cle"],
        "mediaPointAttention":       urls_to_attachments(list_media),
        "PointAttention":           point_attention
    }

    maison_id = airtable_create(T_MAISON, maison_fields)

    print("\n ListeMaison")
    liste_maison_fields = {
        "IdMaison":                  id_maison,
        "NomMaison":              data["Nom Maison"],
        "ImageMaison":           urls_to_attachments([data_breezeway["photo_maison"]]),
    }
    liste_maison = airtable_create(T_LISTEMAISON, liste_maison_fields)


    print("\n  Livret")
    livret_fields = {"IdMaison": data["Nom Maison"]}
    if maison_id:
        livret_fields["Maison"] = [maison_id]
    livret_id = airtable_create(T_LIVRET, livret_fields)
    if maison_id and livret_id:
        airtable_patch(T_MAISON, maison_id, {"Livret": [livret_id]})

    print("\n  Piscine")
    raw_pool_text = data.get("Piscine Raw", "")

    desinfectant = (["Sel"] if "sel" in raw_pool_text.lower()
                    else ["Chlore"] if "chlore" in raw_pool_text.lower()
                    else ["Autre"])

    # Extract Chauffage
    chauffage = []
    if any(w in raw_pool_text.lower() for w in ["chauffée", "pompe à chaleur", "pac ", "chauffage"]):
        chauffage.append("Pompe à chaleur")

    ouv_data = data_breezeway.get("instruction_ouverture_piscine", {})
    ferm_data = data_breezeway.get("instruction_fermeture_piscine", {})

    ouv_text = ouv_data.get("text", "") if isinstance(ouv_data, dict) else ""
    ferm_text = ferm_data.get("text", "") if isinstance(ferm_data, dict) else ""

    pool_images = []
    if isinstance(ouv_data, dict):
        pool_images.extend(ouv_data.get("images", []) or [])
    if isinstance(ferm_data, dict):
        pool_images.extend(ferm_data.get("images", []) or [])

    # Also check extérieur equipment for pool-related items
    ext_equip = data_breezeway.get("equippement_extérieur", {})
    for key, val in ext_equip.items():
        if "piscine" in key.lower() or "pool" in key.lower():
            if isinstance(val, dict):
                pool_images.extend(val.get("images", []) or [])

    # Remove duplicates while preserving order
    pool_images = list(dict.fromkeys([img for img in pool_images if isinstance(img, str)]))

    piscine_fields = {
        "Name": data["Nom Maison"] + " - Piscine",
        "InstructionOuverture": ouv_text,
        "InstructionFermeture": ferm_text,
        "InstructionEntretien": raw_pool_text or ouv_text,
        "InstructionClient": raw_pool_text,
        "Desinfectant": desinfectant,
    }

    if chauffage:
        piscine_fields["Chauffage"] = chauffage

    if pool_images:
        piscine_fields["Photo"] = urls_to_attachments(pool_images)

    pool_videos = []
    if isinstance(ouv_data, dict):
        pool_videos.extend(ouv_data.get("videos", []) or [])
    if isinstance(ferm_data, dict):
        pool_videos.extend(ferm_data.get("videos", []) or [])

    youtube_pool_urls = [v for v in pool_videos if isinstance(v, str) and "youtube" in v.lower()]
    if youtube_pool_urls:
        piscine_fields["lienYoutube"] = youtube_pool_urls[0]

    # Detect product type
    if "auto" in raw_pool_text.lower() or "régulé" in raw_pool_text.lower():
        piscine_fields["Produit"] = ["Autogérée"]
    elif "manuelle" in raw_pool_text.lower() or "manuel" in raw_pool_text.lower():
        piscine_fields["Produit"] = ["manuelle"]

    if maison_id:
        piscine_fields["Maison"] = [maison_id]
    piscine_id = airtable_create(T_PISCINE, piscine_fields)
    if maison_id and piscine_id:
        airtable_patch(T_MAISON, maison_id, {"Piscine": [piscine_id]})


    if data["Pieces"]:
        print(f"\n  DispositionMaison ({len(data['Pieces'])} pièces)")
        pieces_records = []
        for p in data["Pieces"]:
            rec = {
                "Piece":       p["Piece"],
                "Description": p["Description"],
                "Etage":       p["Etage"],
                "type":        p["Type"],
            }
            if maison_id:
                rec["Maison"] = [maison_id]
            pieces_records.append(rec)
        airtable_create_batch(T_DISPOSITION, pieces_records)
    else: