learn/ingest/ingest.py at master · netdata/learn · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
This is the script that gathers markdown files from all of Netdata's repos in this repo

Stages of this ingest script:

    Stage_1: Ingest every available markdown from the defaultRepos

    Stage_2: We create three buckets:
        1. all_markdown_files: all the markdown files in defaultRepos
        2. markdown_files_with_metadata: all the markdown files that have hidden metadata fields
        3. toPublish: markdown files that must be included in the learn
            (metadata_key_value: "learn_status": "Published")

    Stage_3:
        1. Move the toPublish markdown files under the DOCS_PREFIX folder based on their metadata (they decide where,
            they live)
        2. Generate autogenerated pages

    Stage_4: Sanitization
        1. Make the hidden metadata fields actual readable metadata for docusaurus

    Stage_5: Convert GH links to version specific links
"""

# Imports
import argparse
import ast
import errno
import glob
import io
import os
import re
import shutil
import urllib.parse
from pathlib import Path, PurePosixPath
import json
import xml.etree.ElementTree as ET
import git
import numpy as np
import pandas as pd
import requests
import yaml
from jsonschema import Draft7Validator

try:
    from PIL import Image
except Exception:
    Image = None

import autogenerateRedirects as genRedirects

DRY_RUN = False
DEBUG = False
DOCS_PREFIX = "will be added by arguments"

rest_files_dictionary = {}
rest_files_with_metadata_dictionary = {}
to_publish = {}
all_markdown_files = []
UNCORRELATED_LINK_COUNTER = 0
# Dict mapping repo name to dict of broken URLs -> set of source files
UNCORRELATED_URLS_BY_REPO = {}
# Dict mapping repo name to dict of (url, header) -> set of source files
BROKEN_HEADER_LINKS_BY_REPO = {}
FAIL_ON_REPOS = set()  # Set of repo names to fail on if broken links found
FAIL_ON_ALL_BROKEN_LINKS = False  # If True, fail on any broken link regardless of repo
IGNORE_ON_PREM_REPO = (
    False  # If True, skip cloning on-prem repo and ignore links pointing to it
)
LOCAL_REPOS = {}  # Dict mapping repo name to local path (use local dir instead of cloning)
# Temporarily until we release (change it (the default) to /docs
# version_prefix = "nightly"  # We use this as the version prefix in the link strategy
TEMP_FOLDER = "ingest-temp-folder"
default_repos = {
    "netdata": {
        "owner": "netdata",
        "branch": "master",
        "HEAD": "master",
    },
    "netdata-cloud-onprem": {
        "owner": "netdata",
        "branch": "master",
        "HEAD": "master",
    },
    ".github": {
        "owner": "netdata",
        "branch": "main",
        "HEAD": "main",
    },
    "agent-service-discovery": {
        "owner": "netdata",
        "branch": "master",
        "HEAD": "master",
    },
    "netdata-grafana-datasource-plugin": {
        "owner": "netdata",
        "branch": "master",
        "HEAD": "master",
    },
    "helmchart": {
        "owner": "netdata",
        "branch": "master",
        "HEAD": "master",
    },
}


# Regex to extract sidebar_position from frontmatter (supports double or single quotes)
POS_RE = re.compile(r'(?m)^\s*sidebar_position\s*:\s*["\']?(\d+)["\']?\s*$')
SIDEBAR_LABEL_RE = re.compile(r'(?m)^\s*sidebar_label\s*:\s*["\']([^"\']+)["\']\s*$')

# Marker string in auto-generated integration files
INTEGRATION_MARKER = "DO NOT EDIT THIS FILE DIRECTLY"
MAP_SIDEBAR_ORDER = {}
MAP_DOC_SCOPE = {}
MAP_SCHEMA_EXIT_CODE = 2

LOGO_ANALYSIS_TIMEOUT = 8
LOGO_RASTER_SAMPLE_SIZE = 64
LOGO_ALPHA_MIN = 16
LOGO_CONTRAST_THRESHOLD = 2.1
LIGHT_THEME_BG_RGB = (0xFD / 255.0, 0xFD / 255.0, 0xFD / 255.0)
DARK_THEME_BG_RGB = (0.0, 0.0, 0.0)
LOGO_CACHE = {}
LOGO_ANALYSIS_SUMMARY = {
    "analyzed": 0,
    "low_dark": 0,
    "low_light": 0,
    "unknown": 0,
}

MAP_COLUMNS = [
    "custom_edit_url",
    "sidebar_label",
    "learn_status",
    "learn_rel_path",
    "keywords",
    "description",
]


def format_map_schema_error(error):
    path = (
        ".".join(str(p) for p in error.absolute_path) if error.absolute_path else "root"
    )
    return f"[{path}] {error.message}"


def validate_map_schema(map_path, schema_path):
    with open(map_path, "r", encoding="utf-8") as map_file:
        map_data = yaml.safe_load(map_file) or {}

    with open(schema_path, "r", encoding="utf-8") as schema_file:
        schema = json.load(schema_file)

    validator = Draft7Validator(schema)
    errors = [
        format_map_schema_error(error) for error in validator.iter_errors(map_data)
    ]
    return len(errors) == 0, errors


def _normalize_placeholder_kind(kind):
    """Normalize integration placeholder kind to '<kind>_integrations'."""
    if not kind:
        return None
    if kind.endswith("_integrations"):
        return kind
    return f"{kind}_integrations"


def load_map_yaml(map_path):
    """Load simplified map.yaml and return ingest rows as a DataFrame.

    Rules:
    - Document rows are emitted only for nodes with non-empty `meta.edit_url`.
    - Placeholder rows are emitted for `integration_placeholder` entries.
    - `learn_rel_path` is reconstructed from hierarchy and optional `meta.path`.
    - `learn_status` is set to "Published" for emitted document rows.
    """
    with open(map_path, "r", encoding="utf-8") as fh:
        data = yaml.safe_load(fh) or {}

    sidebar = data.get("sidebar")
    if not isinstance(sidebar, list):
        raise ValueError("map.yaml must contain a top-level 'sidebar' list.")

    rows = []

    def join_path(parent_path, segment):
        if not segment:
            return parent_path
        if parent_path == "root":
            return segment
        return f"{parent_path}/{segment}"

    def walk(nodes, parent_path="root"):
        for node in nodes:
            if not node:
                continue
            if isinstance(node, dict) and node.get("type") == "integration_placeholder":
                kind = _normalize_placeholder_kind(node.get("integration_kind"))
                if kind:
                    rows.append(
                        {
                            "custom_edit_url": kind,
                            "sidebar_label": None,
                            "learn_status": None,
                            "learn_rel_path": None,
                            "keywords": None,
                            "description": None,
                        }
                    )
                continue

            if not isinstance(node, dict):
                continue

            meta = node.get("meta")
            if not isinstance(meta, dict):
                continue

            label = meta.get("label")
            has_items = isinstance(node.get("items"), list)
            explicit_path = meta.get("path")

            if has_items:
                segment = explicit_path if explicit_path else label
                current_path = join_path(parent_path, segment)
            else:
                if explicit_path:
                    current_path = join_path(parent_path, explicit_path)
                elif parent_path == "root":
                    current_path = "root"
                else:
                    current_path = parent_path

            edit_url = meta.get("edit_url")
            if isinstance(edit_url, str) and edit_url.strip():
                rows.append(
                    {
                        "custom_edit_url": edit_url,
                        "sidebar_label": label,
                        "learn_status": "Published",
                        "learn_rel_path": current_path,
                        "keywords": meta.get("keywords"),
                        "description": meta.get("description"),
                    }
                )

            if has_items:
                walk(node.get("items"), current_path)

    walk(sidebar)

    return pd.DataFrame(rows, columns=MAP_COLUMNS)


def load_map_sidebar_order(map_path):
    """Build sibling ordering map from map.yaml traversal order.

    Returns:
    - order_map: dict keyed by (parent_path, child_name) with sidebar positions
      assigned as 10, 20, 30, ... within each parent scope.
    - doc_scope_map: dict keyed by edit_url with resolved (parent_path, child_name)
      for rows originating from map.yaml documents.
    """
    with open(map_path, "r", encoding="utf-8") as fh:
        data = yaml.safe_load(fh) or {}

    sidebar = data.get("sidebar")
    if not isinstance(sidebar, list):
        return {}

    order = {}
    doc_scope = {}

    def join_path(parent_path, segment):
        if not segment:
            return parent_path
        if parent_path == "root":
            return segment
        return f"{parent_path}/{segment}"

    def walk(nodes, parent_parts):
        sibling_index = 0
        for node in nodes:
            if not isinstance(node, dict):
                continue
            if node.get("type") == "integration_placeholder":
                continue

            meta = node.get("meta") or {}
            label = meta.get("label")
            if not isinstance(label, str) or not label.strip():
                continue

            has_items = isinstance(node.get("items"), list)
            explicit_path = meta.get("path")

            if has_items:
                child_name = explicit_path if explicit_path else label
            else:
                if explicit_path:
                    child_name = explicit_path
                else:
                    child_name = label

            if not isinstance(child_name, str) or not child_name.strip():
                continue

            sibling_index += 1
            parent_key = "/".join(parent_parts) if parent_parts else "root"
            order.setdefault((parent_key, child_name), sibling_index * 10)

            edit_url = meta.get("edit_url")
            if isinstance(edit_url, str) and edit_url.strip():
                if has_items:
                    doc_scope[edit_url] = (parent_key, child_name)
                else:
                    if explicit_path:
                        leaf_parent = join_path(parent_key, child_name)
                    elif parent_key == "root":
                        leaf_parent = "root"
                    else:
                        leaf_parent = parent_key
                    doc_scope[edit_url] = (leaf_parent, label)

            if has_items:
                walk(node.get("items"), parent_parts + [child_name])

    walk(sidebar, [])
    return order, doc_scope


def ensure_category_json_for_dirs(docs_root):
    """
    For every directory under docs_root:
    - if <dir>/<basename(dir)>.mdx exists => treat as category overview; do nothing
    - else create or overwrite _category_.json using map-derived sibling order
    """
    for dirpath, dirnames, filenames in os.walk(docs_root):
        abs_dir = os.path.abspath(dirpath)
        abs_root = os.path.abspath(docs_root)

        base = os.path.basename(os.path.normpath(dirpath))

        if abs_dir == abs_root:
            continue

        category_json = os.path.join(dirpath, "_category_.json")

        if os.path.exists(os.path.join(dirpath, f"{base}.mdx")):
            continue

        mdx_files = [f for f in filenames if f.lower().endswith(".mdx")]
        if not mdx_files:
            # Do not create category files for stale empty directories.
            continue

        rel_parts = Path(dirpath).relative_to(Path(docs_root)).parts
        parent_key = "/".join(rel_parts[:-1]) if len(rel_parts) > 1 else "root"
        child_key = rel_parts[-1] if rel_parts else base

        map_pos = MAP_SIDEBAR_ORDER.get((parent_key, child_key))
        if map_pos is not None:
            cat_pos = map_pos
        else:
            # Fallback: use alphabetical-last bucket for unmapped categories
            cat_pos = 9999

        payload = {"label": base, "position": cat_pos}

        try:
            with open(category_json, "w", encoding="utf-8") as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)
                f.write("\n")
        except OSError as e:
            print(f"WARNING: Failed to write category JSON '{category_json}': {e}")
            continue

        print(f"CREATE {category_json} with position={cat_pos} label='{base}'")


def _read_sidebar_label(path_to_file):
    """Read sidebar_label from frontmatter; fallback to file stem."""
    try:
        content = Path(path_to_file).read_text(encoding="utf-8")
    except OSError:
        return Path(path_to_file).stem

    match = SIDEBAR_LABEL_RE.search(content)
    if match:
        return match.group(1).strip()
    return Path(path_to_file).stem


def _set_sidebar_position(path_to_file, sidebar_position):
    """Upsert sidebar_position in frontmatter."""
    path_obj = Path(path_to_file)
    content = path_obj.read_text(encoding="utf-8")

    if POS_RE.search(content):
        updated = POS_RE.sub(
            f'sidebar_position: "{sidebar_position}"', content, count=1
        )
        path_obj.write_text(updated, encoding="utf-8")
        return

    lines = content.splitlines()
    if lines and lines[0].strip() == "---":
        lines.insert(1, f'sidebar_position: "{sidebar_position}"')
        path_obj.write_text("\n".join(lines) + "\n", encoding="utf-8")


def _set_category_position(path_to_file, label, sidebar_position):
    """Upsert position in _category_.json preserving label."""
    path_obj = Path(path_to_file)
    payload = {"label": label, "position": sidebar_position}
    path_obj.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
    )


def normalize_sidebar_positions_by_parent(docs_root):
    """Assign unique sibling positions per parent scope.

    Uses map-derived sibling rank when available, then falls back to alphabetical
    ordering for injected/unknown siblings.
    """

    for dirpath, dirnames, filenames in os.walk(docs_root):
        rel_parts = Path(dirpath).relative_to(Path(docs_root)).parts
        parent_key = "/".join(rel_parts) if rel_parts else "root"

        entries = []

        # Direct child docs of this scope (exclude overview page for this scope)
        for fn in sorted(filenames):
            if not fn.lower().endswith(".mdx"):
                continue

            file_path = Path(dirpath) / fn
            if rel_parts and file_path.stem == rel_parts[-1]:
                continue

            label = _read_sidebar_label(file_path)
            entries.append(
                {
                    "path": str(file_path),
                    "child_name": label,
                    "display_label": label,
                    "kind": "doc",
                }
            )

        # Child directories represented by overview pages <child>/<child>.mdx
        for child_dir in sorted(dirnames):
            overview = Path(dirpath) / child_dir / f"{child_dir}.mdx"
            category_json = Path(dirpath) / child_dir / "_category_.json"
            if overview.exists():
                label = _read_sidebar_label(overview)
                entries.append(
                    {
                        "path": str(overview),
                        "child_name": child_dir,
                        "display_label": label,
                        "kind": "doc",
                    }
                )
            elif category_json.exists():
                try:
                    payload = json.loads(category_json.read_text(encoding="utf-8"))
                    label = str(payload.get("label") or child_dir)
                except Exception:
                    label = child_dir
                entries.append(
                    {
                        "path": str(category_json),
                        "child_name": child_dir,
                        "display_label": label,
                        "kind": "category",
                    }
                )

        if not entries:
            continue

        ordered_entries = sorted(
            entries,
            key=lambda item: (
                0
                if parent_key == "root"
                and item["display_label"].strip().lower() == "ask nedi"
                else 1,
                0
                if MAP_SIDEBAR_ORDER.get((parent_key, item["child_name"])) is not None
                else 1,
                MAP_SIDEBAR_ORDER.get((parent_key, item["child_name"]))
                if MAP_SIDEBAR_ORDER.get((parent_key, item["child_name"])) is not None
                else 0,
                item["display_label"].lower(),
            ),
        )

        for index, item in enumerate(ordered_entries, start=1):
            position_value = index * 10
            if (
                parent_key == "root"
                and item["display_label"].strip().lower() == "ask nedi"
            ):
                position_value = 0

            if item.get("kind") == "category":
                _set_category_position(
                    item["path"], item["display_label"], position_value
                )
            else:
                _set_sidebar_position(item["path"], position_value)


def clean_and_lower_string(string):
    """Normalize text for slug-like sorting and URL path segments."""
    return re.sub(
        r"(-)+",
        "-",
        string.lower().replace(",", "-").replace(" ", "-").replace("//", "/"),
    )


def extract_headers_from_file(file_path):
    """
    Extract all headers from a markdown file and return them as a set of anchor IDs.
    Headers are converted to anchor format: lowercase, spaces to hyphens, special chars removed.
    """
    headers = set()
    try:
        content = Path(file_path).read_text()
        # Match markdown headers (# Header, ## Header, etc.)
        header_pattern = r"^#{1,6}\s+(.+)$"
        for match in re.finditer(header_pattern, content, re.MULTILINE):
            header_text = match.group(1).strip()
            # Convert header to anchor ID (similar to how markdown processors do it)
            # Remove inline code backticks, bold/italic markers
            anchor = re.sub(r"[`*_]", "", header_text)
            # Remove HTML tags
            anchor = re.sub(r"<[^>]+>", "", anchor)
            # Convert to lowercase, replace spaces with hyphens
            anchor = anchor.lower().replace(" ", "-")
            # Remove special characters except hyphens
            anchor = re.sub(r"[^a-z0-9-]", "", anchor)
            # Remove multiple consecutive hyphens
            anchor = re.sub(r"-+", "-", anchor)
            # Remove leading/trailing hyphens
            anchor = anchor.strip("-")
            if anchor:
                headers.add(anchor)
    except Exception as e:
        pass
    return headers


def validate_header_in_file(file_path, header):
    """
    Check if a header/anchor exists in the target file.
    Returns True if the header exists or if header is empty, False otherwise.
    """
    if not header:
        return True
    headers = extract_headers_from_file(file_path)
    # Also check the raw header (some anchors are preserved as-is)
    return header.lower() in headers or header in headers


def extract_repo_from_github_url(url):
    """
    Extract the repository name from a GitHub URL.

    Example:
        https://github.com/netdata/netdata/blob/master/src/file.md -> netdata
        https://github.com/netdata/helmchart/blob/master/README.md -> helmchart
    """
    if not url.startswith("https://github.com/netdata/"):
        return "unknown"

    # URL format: https://github.com/netdata/<repo>/...
    parts = url.replace("https://github.com/netdata/", "").split("/")
    if parts:
        return parts[0]
    return "unknown"


def extract_repo_from_local_path(path):
    """
    Extract the repository name from a local path in the temp folder.

    Example:
        ingest-temp-folder/netdata/src/file.md -> netdata
        docs/something/file.mdx -> unknown (not from temp folder)
    """
    if path.startswith(TEMP_FOLDER + "/"):
        parts = path.replace(TEMP_FOLDER + "/", "").split("/")
        if parts:
            return parts[0]
    return "unknown"


def add_broken_url(repo, url, source_file):
    """Add a broken URL to the tracking dictionary, categorized by repo."""
    global UNCORRELATED_URLS_BY_REPO

    # If ignoring on-prem repo, skip URLs that point to it
    if IGNORE_ON_PREM_REPO:
        target_repo = extract_repo_from_github_url(url)
        if target_repo == "netdata-cloud-onprem":
            return

    if repo not in UNCORRELATED_URLS_BY_REPO:
        UNCORRELATED_URLS_BY_REPO[repo] = {}
    if url not in UNCORRELATED_URLS_BY_REPO[repo]:
        UNCORRELATED_URLS_BY_REPO[repo][url] = set()
    UNCORRELATED_URLS_BY_REPO[repo][url].add(source_file)


def add_broken_header(repo, full_link, header, source_file):
    """Add a broken header link to the tracking dictionary, categorized by repo."""
    global BROKEN_HEADER_LINKS_BY_REPO

    # If ignoring on-prem repo, skip links that point to it
    if IGNORE_ON_PREM_REPO:
        target_repo = extract_repo_from_github_url(full_link)
        if target_repo == "netdata-cloud-onprem":
            return

    if repo not in BROKEN_HEADER_LINKS_BY_REPO:
        BROKEN_HEADER_LINKS_BY_REPO[repo] = {}
    key = (full_link, header)
    if key not in BROKEN_HEADER_LINKS_BY_REPO[repo]:
        BROKEN_HEADER_LINKS_BY_REPO[repo][key] = set()
    BROKEN_HEADER_LINKS_BY_REPO[repo][key].add(source_file)


def github_url_to_local_path(url):
    """
    Convert a GitHub URL to a local path in the temp folder.
    Returns the local path if it can be constructed, None otherwise.

    Example:
        https://github.com/netdata/netdata/blob/master/src/go/pkg/prometheus/selector/README.md
        -> ingest-temp-folder/netdata/src/go/pkg/prometheus/selector/README.md
    """
    if not url.startswith("https://github.com/netdata"):
        return None

    # Convert URL to local path
    local_path = url.replace("https://github.com/netdata", TEMP_FOLDER)
    local_path = local_path.replace("edit/", "blob/", 1)
    local_path = local_path.replace("blob/master/", "")
    local_path = local_path.replace("blob/main/", "")

    return local_path


def resolve_repo_relative_link(raw_link, source_file_path):
    """
    Resolve a local markdown link into:
      1) repo-root-relative path (e.g. src/go/plugin/README.md)
      2) local temp-folder path (e.g. ingest-temp-folder/netdata/src/go/plugin/README.md)

    Returns tuple(repo_relative_path, local_path) or (None, None) if not resolvable.
    """
    if not raw_link:
        return None, None

    repo_name = extract_repo_from_local_path(source_file_path)
    if repo_name == "unknown":
        return None, None

    repo_root = Path(TEMP_FOLDER) / repo_name
    source_path = Path(source_file_path)

    try:
        source_parent = source_path.relative_to(repo_root).parent
    except ValueError:
        return None, None

    link = raw_link.replace("\\", "/").strip()

    # Root-relative links are relative to repo root. Dot-relative links are from the source file directory.
    if link.startswith("/"):
        candidate = link.lstrip("/")
    elif link.startswith("."):
        parent_prefix = source_parent.as_posix()
        candidate = f"{parent_prefix}/{link}" if parent_prefix != "." else link
    else:
        return None, None

    normalized = PurePosixPath(candidate).as_posix()
    normalized = os.path.normpath(normalized).replace("\\", "/")

    while normalized.startswith("./"):
        normalized = normalized[2:]

    if normalized in ("", ".", "..") or normalized.startswith("../"):
        return None, None

    local_path = (repo_root / Path(normalized)).as_posix()
    return normalized, local_path


def file_exists_in_repos(url):
    """
    Check if a GitHub URL points to a file that exists in the cloned repos.
    Returns True if the file exists, False otherwise.
    """
    local_path = github_url_to_local_path(url)
    if local_path is None:
        return False
    return Path(local_path).exists()


def convert_parenthetical_slash(segment: str) -> str:
    """
    Convert occurrences like "(ABC/XYZ)" into "ABC-XYZ" inside a string.
    Only converts simple parenthetical groups containing a single slash.
    """
    if not segment:
        return segment

    # Replace occurrences of (A/B) or (A/B/C) with A-B or A-B-C respectively
    def repl(m):
        inner = m.group(1)
        parts = inner.split("/")
        return "-".join(parts)

    return re.sub(r"\(([^()]+?/[^()]+?)\)", repl, segment)


def populate_integrations(markdownFiles):
    """
    Populate integration placeholders in the map using integration metadata.

    Reads integration metadata from markdown files, builds category-specific
    integration entry sets, and replaces placeholder sentinel rows in the map.
    """

    print("### Populating map from Integration metadata rows ###\n")

    metadata_dictionary = {}
    ignore_dup = []

    # Read the map file, to replace the placeholder for the dynamic part
    map_file = load_map_yaml("map.yaml")

    collectors_entries = pd.DataFrame()
    live_functions_entries = pd.DataFrame()
    exporting_entries = pd.DataFrame()
    alerting_agent_entries = pd.DataFrame()
    alerting_cloud_entries = pd.DataFrame()
    authentication_entries = pd.DataFrame()
    secretstore_entries = pd.DataFrame()
    logs_entries = pd.DataFrame()

    readmes_first = []
    others_last = []
    for file in markdownFiles:
        if "README.md" in file:
            readmes_first.append(file)
        else:
            others_last.append(file)

    markdownFiles = readmes_first + others_last

    for file in markdownFiles:
        normalized_file = file.replace("\\", "/")
        path = file.split("integrations")[0].replace("README.md", "")

        whole_file = Path(file).read_text()

        if whole_file not in ignore_dup and INTEGRATION_MARKER in whole_file:
            meta = (
                whole_file.split("endmeta-->")[0].replace("<!--startmeta", "---")
                + "---"
            )

            metadata_dictionary = read_metadata(meta)

            if os.path.islink(file):
                ignore_dup.append(whole_file)
                # If it is a manual symlink, meaning a README symlink but the folder has more than one integration, thus their custom_edit_urls are unique. 1:1 integrations have the README link as custom_edit_url
                if (
                    not file.replace("ingest-temp-folder/", "").split("/", 1)[1]
                    in metadata_dictionary["custom_edit_url"]
                ):
                    proper_edit_url = file.replace("ingest-temp-folder/", "")

                    proper_edit_url = (
                        "https://github.com/netdata/"
                        + proper_edit_url.split("/", 1)[0]
                        + "/edit/master/"
                        + proper_edit_url.split("/", 1)[1]
                    )
                    metadata_dictionary["custom_edit_url"] = proper_edit_url

                    # print("path:", file)
                    # print(metadata_dictionary)

            metadf = pd.DataFrame([metadata_dictionary])
            # print(file)
            if "/integrations/functions/" in normalized_file:
                # Compatibility for generated function tiles: ensure
                # custom_edit_url is path-based so map lookup can match files.
                proper_edit_url = file.replace("ingest-temp-folder/", "")
                if "/" in proper_edit_url:
                    repo_name, repo_rel_path = proper_edit_url.split("/", 1)
                    branch_name = "main" if repo_name == ".github" else "master"
                    metadf["custom_edit_url"] = (
                        f"https://github.com/netdata/{repo_name}/edit/{branch_name}/{repo_rel_path}"
                    )

                live_functions_entries = pd.concat([live_functions_entries, metadf])
            elif "collector" in path:
                collectors_entries = pd.concat([collectors_entries, metadf])
                # print(collectors_entries)
                # quit()
            elif "/secretstore/backends/" in normalized_file:
                secretstore_entries = pd.concat([secretstore_entries, metadf])
            elif "exporting" in path:
                exporting_entries = pd.concat([exporting_entries, metadf])
                # print(exporting_entries)
            elif "cloud-authentication" in file:
                authentication_entries = pd.concat([authentication_entries, metadf])
            # here we need a different check, as the path variable gets messed up
            elif "cloud-notifications" in file:
                # print("in")
                alerting_cloud_entries = pd.concat([alerting_cloud_entries, metadf])
            elif "logs" in file:
                # Custom location for Logs integrations, as they normally have a pretty big README that we add as a reference, as a child to the integration's folder.
                metadf["learn_rel_path"] = (
                    metadf["learn_rel_path"] + "/" + metadf["sidebar_label"]
                )

                logs_entries = pd.concat([logs_entries, metadf])
            else:
                alerting_agent_entries = pd.concat([alerting_agent_entries, metadf])

    # print("Collectors\n", collectors_entries, "Agent alerts\n", alerting_agent, "Cloud alerts\n",  alerting_cloud, "Exporting",  exporting_entries)

    replace_index = map_file.loc[
        map_file["custom_edit_url"] == "authentication_integrations"
    ].index
    # print(replace_index[0])
    upper = map_file.iloc[: replace_index[0]]
    lower = map_file.iloc[replace_index[0] + 1 :]

    map_file = pd.concat(
        [
            upper,
            authentication_entries.sort_values(
                by=["learn_rel_path", "sidebar_label"], key=lambda col: col.str.lower()
            ),
            lower,
        ],
        ignore_index=True,
    )

    replace_index = map_file.loc[
        map_file["custom_edit_url"] == "collectors_integrations"
    ].index
    # print(replace_index[0])
    upper = map_file.iloc[: replace_index[0]]
    lower = map_file.iloc[replace_index[0] + 1 :]

    map_file = pd.concat(
        [
            upper,
            collectors_entries.sort_values(
                by=["learn_rel_path", "sidebar_label"], key=lambda col: col.str.lower()
            ),
            lower,
        ],
        ignore_index=True,
    )

    replace_index = map_file.loc[
        map_file["custom_edit_url"] == "secretstore_integrations"
    ].index
    upper = map_file.iloc[: replace_index[0]]
    lower = map_file.iloc[replace_index[0] + 1 :]

    map_file = pd.concat(
        [
            upper,
            secretstore_entries.sort_values(
                by=["learn_rel_path", "sidebar_label"], key=lambda col: col.str.lower()
            ),
            lower,
        ],
        ignore_index=True,
    )

    replace_index = map_file.loc[
        map_file["custom_edit_url"] == "live_functions_integrations"
    ].index
    if len(replace_index) > 0:
        upper = map_file.iloc[: replace_index[0]]
        lower = map_file.iloc[replace_index[0] + 1 :]

        map_file = pd.concat(
            [
                upper,
                live_functions_entries.sort_values(
                    by=["learn_rel_path", "sidebar_label"],
                    key=lambda col: col.str.lower(),
                ),
                lower,
            ],
            ignore_index=True,
        )

    replace_index = map_file.loc[
        map_file["custom_edit_url"] == "agent_notifications_integrations"
    ].index
    # print(replace_index[0])
    upper = map_file.iloc[: replace_index[0]]
    lower = map_file.iloc[replace_index[0] + 1 :]

    map_file = pd.concat(
        [
            upper,
            alerting_agent_entries.sort_values(
                by=["learn_rel_path", "sidebar_label"], key=lambda col: col.str.lower()
            ),
            lower,
        ],
        ignore_index=True,
    )

    replace_index = map_file.loc[
        map_file["custom_edit_url"] == "cloud_notifications_integrations"
    ].index
    upper = map_file.iloc[: replace_index[0]]
    lower = map_file.iloc[replace_index[0] + 1 :]

    map_file = pd.concat(
        [
            upper,
            alerting_cloud_entries.sort_values(
                by=["learn_rel_path", "sidebar_label"], key=lambda col: col.str.lower()
            ),
            lower,
        ],
        ignore_index=True,
    )

    replace_index = map_file.loc[
        map_file["custom_edit_url"] == "exporters_integrations"
    ].index
    # print(replace_index[0])
    upper = map_file.iloc[: replace_index[0]]
    lower = map_file.iloc[replace_index[0] + 1 :]

    map_file = pd.concat(
        [
            upper,
            exporting_entries.sort_values(
                by=["learn_rel_path", "sidebar_label"], key=lambda col: col.str.lower()
            ),
            lower,
        ],
        ignore_index=True,
    )

    replace_index = map_file.loc[
        map_file["custom_edit_url"] == "logs_integrations"
    ].index
    upper = map_file.iloc[: replace_index[0]]
    lower = map_file.iloc[replace_index[0] + 1 :]

    map_file = pd.concat(
        [
            upper,
            logs_entries.sort_values(
                by=["learn_rel_path", "sidebar_label"], key=lambda col: col.str.lower()
            ),
            lower,
        ],
        ignore_index=True,
    )

    # Convert DataFrame to list of dicts and save as YAML
    generated_map_data = map_file.to_dict(orient="records")
    with open("ingest/generated_map.yaml", "w", encoding="utf-8") as fh:
        yaml.dump(
            generated_map_data,
            fh,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )