Skip to content

Commit f764579

Browse files
committed
Initial commit
0 parents  commit f764579

19 files changed

+858
-0
lines changed

.gitignore

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
*.egg-info/
24+
.installed.cfg
25+
*.egg
26+
27+
# PyInstaller
28+
# Usually these files are written by a python script from a template
29+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
30+
*.manifest
31+
*.spec
32+
33+
# Installer logs
34+
pip-log.txt
35+
pip-delete-this-directory.txt
36+
37+
# Unit test / coverage reports
38+
htmlcov/
39+
.tox/
40+
.nox/
41+
.coverage
42+
.coverage.*
43+
.cache
44+
nosetests.xml
45+
coverage.xml
46+
*.cover
47+
*.py,cover
48+
.hypothesis/
49+
.pytest_cache/
50+
51+
# Translations
52+
*.mo
53+
*.pot
54+
55+
# Django stuff:
56+
*.log
57+
local_settings.py
58+
59+
# Scrapy stuff:
60+
.scrapy
61+
62+
# Environments
63+
.env
64+
.venv
65+
env/
66+
venv/
67+
ENV/
68+
env.bak/
69+
venv.bak/
70+
71+
# logs
72+
logs/

core/anonymizer.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from typing import NamedTuple, Dict, Set
2+
import spacy
3+
from .patterns import PATTERNS
4+
from .transforms import generalize_money, generalize_dates
5+
from .profile_config import PROFILES, DEFAULT_PROFILE
6+
from utils.log import logger
7+
8+
# A span of sensitive text detected in the input, together with its
# character offsets in the original document (start inclusive, end exclusive).
Entity = NamedTuple(
    "Entity",
    [("text", str), ("type", str), ("start", int), ("end", int)],
)
13+
14+
# Load the Polish spaCy model once at import time.  On failure we fall back
# to None so the rest of the pipeline can degrade gracefully to regex-only
# entity recognition (anonymize_text checks `if nlp:` before running NER).
try:
    nlp = spacy.load("pl_core_news_lg")
except OSError:
    # spaCy raises OSError when the model package is not installed/downloaded.
    logger.error("Nie znaleziono modelu 'pl_core_news_lg'. Uruchom:\npython -m spacy download pl_core_news_lg")
    nlp = None
20+
21+
def safe_substitute(text: str, substitution_dict: dict) -> str:
22+
sorted_keys = sorted(substitution_dict.keys(), key=len, reverse=True)
23+
for original in sorted_keys:
24+
replacement = substitution_dict[original]
25+
text = text.replace(original, replacement)
26+
return text
27+
28+
def anonymize_text(text: str, profile: str = DEFAULT_PROFILE, custom_classes: list[str] = None) -> tuple[str, dict]:
    """Anonymize *text* according to an anonymization profile.

    Args:
        text: The raw input text.
        profile: Profile name from PROFILES; unknown names fall back to
            DEFAULT_PROFILE.
        custom_classes: Optional explicit list of entity classes to anonymize,
            overriding the profile's own class list (None = use the profile).

    Returns:
        A ``(anonymized_text, token_map)`` tuple where ``token_map`` maps each
        generated ``<CLASS_n>`` token back to its original string (only the
        reversible "tokenize" transform contributes to the map).
    """
    profile_config = PROFILES.get(profile, PROFILES[DEFAULT_PROFILE])
    enabled_classes = custom_classes if custom_classes is not None else profile_config["classes"]
    transformations = profile_config.get("transformations", {})

    all_entities: list[Entity] = []

    # Character spans already claimed by an accepted entity.
    matched_pos: Set[range] = set()

    def overlaps(start: int, end: int) -> bool:
        # True interval-intersection test.  The previous check
        # (``start in r or end - 1 in r``) only looked at the candidate's two
        # endpoints, so a new span that fully CONTAINED an existing range was
        # not detected as overlapping and could produce conflicting entities.
        return any(start < r.stop and r.start < end for r in matched_pos)

    # Step 1: find entities with the regex patterns and record their positions.
    logger.debug("--- Regex Entity Recognition ---")
    for class_name, pattern in PATTERNS:
        for match in pattern.finditer(text):
            start, end = match.span()
            # Only accept spans that do not overlap an earlier match.
            if not overlaps(start, end):
                entity = Entity(match.group(0), class_name, start, end)
                all_entities.append(entity)
                matched_pos.add(range(start, end))
                logger.debug(f"Regex found: {entity}")

    # Step 2: NER entities, ignoring anything that overlaps a regex match.
    logger.debug("--- NER Entity Recognition ---")
    if nlp:
        doc = nlp(text)
        # Map spaCy's Polish labels onto the anonymizer's class names.
        ner_map = {"persName": "PERSON", "placeName": "LOCATION", "orgName": "ORGANIZATION"}
        for ent in doc.ents:
            start, end = ent.start_char, ent.end_char
            if not overlaps(start, end):
                entity_type = ner_map.get(ent.label_)
                if entity_type:
                    entity = Entity(ent.text.strip(), entity_type, start, end)
                    all_entities.append(entity)
                    matched_pos.add(range(start, end))
                    logger.debug(f"NER found: {entity}")

    # Step 3: sort all detected entities by their start offset so token
    # numbering follows document order.
    all_entities.sort(key=lambda x: x.start)
    logger.debug(f"All entities sorted by position: {[e.text for e in all_entities]}")

    # Step 4: build the substitution dictionary.
    substitution_dict: Dict[str, str] = {}
    token_counters: Dict[str, int] = {}

    for entity in all_entities:
        if entity.type in enabled_classes:
            # Classes without an explicit transform default to tokenization.
            transform_type = transformations.get(entity.type, "tokenize")

            if transform_type == "tokenize":
                # Reversible pseudonymization: one numbered token per class.
                count = token_counters.get(entity.type, 0)
                substitution_dict[entity.text] = f"<{entity.type}_{count}>"
                token_counters[entity.type] = count + 1
            elif transform_type == "remove":
                substitution_dict[entity.text] = ""
            elif transform_type == "generalize_loc":
                substitution_dict[entity.text] = "[MIASTO WOJEWÓDZKIE]"
            elif transform_type == "generalize":
                if entity.type == "MONEY":
                    substitution_dict.update(generalize_money(entity.text))
                elif entity.type == "DATE":
                    substitution_dict.update(generalize_dates(entity.text))

    if profile == 'gdpr':
        # Extra GDPR pass: generalize dates/amounts the entity step missed.
        temp_text = safe_substitute(text, substitution_dict)
        substitution_dict.update(generalize_dates(temp_text))
        substitution_dict.update(generalize_money(temp_text))

    anonymized_text = safe_substitute(text, substitution_dict)
    # Reverse map only for reversible (<CLASS_n>) tokens.
    token_map = {v: k for k, v in substitution_dict.items() if v.startswith('<')}
    return anonymized_text, token_map

core/normalizer.py

Whitespace-only changes.

core/patterns.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import re
2+
3+
# Regex patterns for identifying sensitive data.

# Stored as a list of tuples to preserve processing order.
# Important: PESEL must come before REGON to avoid a wrong match
# (an 11-digit PESEL would otherwise also satisfy the 9-14 digit REGON run).
PATTERNS = [
    # Land-register (księga wieczysta) number, e.g. "WA4M/00123456/7".
    ("KW", re.compile(r"\b[A-Z]{2}[A-Z0-9]{2}/\d{8}/\d\b")),
    # PESEL national ID: exactly 11 digits (no checksum validation here).
    ("PESEL", re.compile(r"\b\d{11}\b")),
    # NIP tax ID: 10 digits, optionally grouped 3-3-2-2 with dashes/spaces.
    ("NIP", re.compile(r"\b\d{3}[- ]?\d{3}[- ]?\d{2}[- ]?\d{2}\b")),
    # REGON: the regex permissively accepts any 9-14 digit run
    # (real REGONs are 9 or 14 digits).
    ("REGON", re.compile(r"\b\d{9,14}\b")),
    # Simplified pattern for quoted organization names ending in "S.A.".
    ("ORGANIZATION", re.compile(r'"([^"]+ S\.A\.)"', re.IGNORECASE)),
    # Amounts like "521 000,00 zł": space/dot thousands, comma decimals.
    ("MONEY", re.compile(r"(\d{1,3}(?:[ .]\d{3})*(?:,\d{2})?)\s?(z[łl])", re.IGNORECASE)),
    # Numeric dates, e.g. "12.03.2023".
    ("DATE", re.compile(r"\b\d{1,2}\.\d{1,2}\.\d{4}\b")),
    # Verbal dates, e.g. "12 marca 2023 roku".
    ("DATE", re.compile(r"\b\d{1,2} [a-zA-Z]+ \d{4} roku\b", re.IGNORECASE)),
]

core/profile_config.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Anonymization profile definitions.
#
# Each profile declares:
#   "classes"         - entity classes that are anonymized at all;
#   "transformations" - per-class transform name; classes without an entry
#                       fall back to "tokenize" in the anonymizer.
# Known transform names used by the anonymizer: "tokenize", "remove",
# "generalize", "generalize_loc".  NOTE(review): "replace_semantic" appears
# only here (llm-safe) and has no branch in anonymize_text, so those classes
# currently pass through unchanged — confirm whether that is intended.

PROFILES = {
    "pseudonymized": {
        "description": "Pseudonimizacja: Zamiana encji na tagi, z możliwością odwrócenia (generuje mapę).",
        "classes": ["PESEL", "NIP", "REGON", "KW", "PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
        "transformations": {
            # Explicit for clarity; "tokenize" is also the default transform.
            "DATE": "tokenize",
            "MONEY": "tokenize"
        }
    },
    "gdpr": {
        "description": "Anonimizacja RODO: Nieodwracalna generalizacja danych.",
        "classes": ["PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
        "transformations": {
            "DATE": "generalize",
            "MONEY": "generalize",
            "PERSON": "remove",
            "LOCATION": "generalize_loc",
            "ORGANIZATION": "remove"
        }
    },
    "llm-safe": {
        "description": "Profil LLM-Safe: Maksymalna anonimizacja, zastępowanie semantyczne.",
        "classes": ["PESEL", "NIP", "REGON", "KW", "PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
        "transformations": {
            "DATE": "generalize",
            "MONEY": "generalize",
            "PERSON": "replace_semantic",
            "LOCATION": "replace_semantic",
            "ORGANIZATION": "replace_semantic"
        }
    }
}

# Profile used when the caller does not specify one.
DEFAULT_PROFILE = "pseudonymized"

core/transforms.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import re
2+
3+
def generalize_money(text: str) -> dict:
    """Map every money amount found in *text* to a coarse range description.

    Example: ``521 000,00 zł`` -> ``około 520 tys. PLN``.

    Returns:
        Mapping of each matched amount string to its generalized label.
    """
    # Simple amount pattern; can be extended as needed.
    amount_re = re.compile(r"(\d{1,3}(?:[ .]\d{3})*(?:,\d{2})?)\s?(z[łl])", re.IGNORECASE)
    replacements: dict = {}

    for found in amount_re.finditer(text):
        raw = found.group(0)
        # Drop thousand separators (space/dot), then switch the decimal
        # comma to a dot so the value parses as a float.
        normalized = found.group(1).replace(" ", "").replace(".", "").replace(",", ".")
        amount = float(normalized)

        if amount > 1_000_000:
            # Round to the nearest 100k, express in millions.
            millions = round(amount / 100_000) * 100_000 / 1_000_000
            label = f"ponad {millions:.1f} mln PLN"
        elif amount > 10_000:
            # Round to the nearest 10k, express in thousands.
            thousands = round(amount / 10_000) * 10_000 / 1000
            label = f"około {thousands:.0f} tys. PLN"
        else:
            label = "poniżej 10 tys. PLN"

        # Keep the first label computed for a given amount string.
        replacements.setdefault(raw, label)

    return replacements
30+
31+
def generalize_dates(text: str) -> dict:
    """Map numeric dates in *text* to quarter+year generalizations.

    Example: ``12.03.2023`` -> ``Q1 2023``.

    Dates are expected in day-first order (DD.MM.YYYY, also with ``/`` or
    ``-`` separators).  Matches whose month part is outside 1-12 are now
    skipped instead of being silently forced into Q4 (the previous behavior
    classified e.g. the US-style "10.25.2023" as "Q4 2023"), since they are
    not valid day-first dates.

    Returns:
        Mapping of each matched date string to its "Qn YYYY" label.
    """
    pattern = re.compile(r"\b(\d{1,2})[./-](\d{1,2})[./-](\d{4})\b")
    substitution_dict: dict = {}

    for match in pattern.finditer(text):
        original_string = match.group(0)
        month = int(match.group(2))  # day (group 1) is deliberately dropped
        year = int(match.group(3))

        if not 1 <= month <= 12:
            continue  # not a valid day-first date; leave it untouched

        # Months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
        quarter = (month - 1) // 3 + 1
        # Keep the first label computed for a given date string.
        substitution_dict.setdefault(original_string, f"Q{quarter} {year}")

    return substitution_dict

interfaces/cli.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import argparse
2+
from core.anonymizer import anonymize_text
3+
from .file_io import read_file, write_file, save_map_dict
4+
from utils.log import setup_logger
5+
6+
def main():
    """Command-line entry point: read a document, anonymize it, write results."""
    arg_parser = argparse.ArgumentParser(description="Anonymize documents from the command line.")
    arg_parser.add_argument("--input", required=True, help="Path to the source file.")
    arg_parser.add_argument("--output", required=True, help="Path to the output anonymized file.")
    arg_parser.add_argument("--profile", default="pseudonymized",
                            choices=["pseudonymized", "gdpr", "llm-safe"],
                            help="Anonymization profile.")
    # --classes is optional; when given it overrides the profile's class list.
    arg_parser.add_argument("--classes", nargs='+', default=None,
                            help="Custom list of classes to anonymize (overwrites profile setting).")
    args = arg_parser.parse_args()

    # Configure the logger using the input file name.
    logger = setup_logger(args.input)

    # Read the source document; bail out if nothing was read.
    source_text = read_file(args.input)
    if not source_text:
        return

    # Run the anonymization pipeline.
    logger.info(f"Starting anonymization for {args.input} using profile: {args.profile}...")
    result_text, substitution_map = anonymize_text(source_text, args.profile, args.classes)

    # Persist the anonymized text.
    write_file(args.output, result_text)

    # Persist the token map next to the output file, if there is one.
    map_file_path = args.output.rsplit('.', 1)[0] + '_map.json'
    if not substitution_map:
        logger.info("No tokenized entities found to create a map.")
    else:
        save_map_dict(map_file_path, substitution_map)
        logger.info(f"Anonymization complete. Output saved to: {args.output}")
        logger.info(f"Substitution map saved to: {map_file_path}")

if __name__ == "__main__":
    main()

interfaces/file_io.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import json
2+
3+
def read_file(file_path: str) -> str:
    """Read and return the full contents of a UTF-8 text file.

    Returns an empty string (after printing an error) when the file
    does not exist.
    """
    try:
        handle = open(file_path, "r", encoding="utf-8")
    except FileNotFoundError:
        print(f"Błąd: Plik nie został znaleziony: {file_path}")
        return ""
    with handle:
        return handle.read()
11+
12+
def write_file(file_path: str, content: str):
    """Write *content* to a UTF-8 text file, overwriting any existing file."""
    with open(file_path, "w", encoding="utf-8") as out:
        out.write(content)
16+
17+
def save_map_dict(file_path: str, substitution_dict: dict):
    """Persist the substitution map as pretty-printed, non-ASCII-safe JSON."""
    payload = json.dumps(substitution_dict, ensure_ascii=False, indent=2)
    with open(file_path, "w", encoding="utf-8") as out:
        out.write(payload)

0 commit comments

Comments
 (0)