Skip to content

Commit f764579

Browse files
committed
Initial commit
0 parents  commit f764579

19 files changed

+858
-0
lines changed

.gitignore

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
*.egg-info/
24+
.installed.cfg
25+
*.egg
26+
27+
# PyInstaller
28+
# Usually these files are written by a python script from a template
29+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
30+
*.manifest
31+
*.spec
32+
33+
# Installer logs
34+
pip-log.txt
35+
pip-delete-this-directory.txt
36+
37+
# Unit test / coverage reports
38+
htmlcov/
39+
.tox/
40+
.nox/
41+
.coverage
42+
.coverage.*
43+
.cache
44+
nosetests.xml
45+
coverage.xml
46+
*.cover
47+
*.py,cover
48+
.hypothesis/
49+
.pytest_cache/
50+
51+
# Translations
52+
*.mo
53+
*.pot
54+
55+
# Django stuff:
56+
*.log
57+
local_settings.py
58+
59+
# Scrapy stuff:
60+
.scrapy
61+
62+
# Environments
63+
.env
64+
.venv
65+
env/
66+
venv/
67+
ENV/
68+
env.bak/
69+
venv.bak/
70+
71+
# logs
72+
logs/

core/anonymizer.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from typing import NamedTuple, Dict, Set
2+
import spacy
3+
from .patterns import PATTERNS
4+
from .transforms import generalize_money, generalize_dates
5+
from .profile_config import PROFILES, DEFAULT_PROFILE
6+
from utils.log import logger
7+
8+
# A span of sensitive text detected in the input, together with its
# character offsets in the original document (start inclusive, end exclusive).
Entity = NamedTuple(
    "Entity",
    [("text", str), ("type", str), ("start", int), ("end", int)],
)
13+
14+
# Load the Polish spaCy model once at import time.  On failure we fall back
# to None so the rest of the pipeline can degrade gracefully to regex-only
# entity recognition (anonymize_text checks `if nlp:` before running NER).
try:
    nlp = spacy.load("pl_core_news_lg")
except OSError:
    # spaCy raises OSError when the model package is not installed/downloaded.
    logger.error("Nie znaleziono modelu 'pl_core_news_lg'. Uruchom:\npython -m spacy download pl_core_news_lg")
    nlp = None
20+
21+
def safe_substitute(text: str, substitution_dict: dict) -> str:
22+
sorted_keys = sorted(substitution_dict.keys(), key=len, reverse=True)
23+
for original in sorted_keys:
24+
replacement = substitution_dict[original]
25+
text = text.replace(original, replacement)
26+
return text
27+
28+
def anonymize_text(text: str, profile: str = DEFAULT_PROFILE, custom_classes: list[str] = None) -> tuple[str, dict]:
    """Anonymize *text* according to an anonymization profile.

    Args:
        text: The raw input text.
        profile: Profile name from PROFILES; unknown names fall back to
            DEFAULT_PROFILE.
        custom_classes: Optional explicit list of entity classes to anonymize,
            overriding the profile's own class list (None = use the profile).

    Returns:
        A ``(anonymized_text, token_map)`` tuple where ``token_map`` maps each
        generated ``<CLASS_n>`` token back to its original string (only the
        reversible "tokenize" transform contributes to the map).
    """
    profile_config = PROFILES.get(profile, PROFILES[DEFAULT_PROFILE])
    enabled_classes = custom_classes if custom_classes is not None else profile_config["classes"]
    transformations = profile_config.get("transformations", {})

    all_entities: list[Entity] = []

    # Character spans already claimed by an accepted entity.
    matched_pos: Set[range] = set()

    def overlaps(start: int, end: int) -> bool:
        # True interval-intersection test.  The previous check
        # (``start in r or end - 1 in r``) only looked at the candidate's two
        # endpoints, so a new span that fully CONTAINED an existing range was
        # not detected as overlapping and could produce conflicting entities.
        return any(start < r.stop and r.start < end for r in matched_pos)

    # Step 1: find entities with the regex patterns and record their positions.
    logger.debug("--- Regex Entity Recognition ---")
    for class_name, pattern in PATTERNS:
        for match in pattern.finditer(text):
            start, end = match.span()
            # Only accept spans that do not overlap an earlier match.
            if not overlaps(start, end):
                entity = Entity(match.group(0), class_name, start, end)
                all_entities.append(entity)
                matched_pos.add(range(start, end))
                logger.debug(f"Regex found: {entity}")

    # Step 2: NER entities, ignoring anything that overlaps a regex match.
    logger.debug("--- NER Entity Recognition ---")
    if nlp:
        doc = nlp(text)
        # Map spaCy's Polish labels onto the anonymizer's class names.
        ner_map = {"persName": "PERSON", "placeName": "LOCATION", "orgName": "ORGANIZATION"}
        for ent in doc.ents:
            start, end = ent.start_char, ent.end_char
            if not overlaps(start, end):
                entity_type = ner_map.get(ent.label_)
                if entity_type:
                    entity = Entity(ent.text.strip(), entity_type, start, end)
                    all_entities.append(entity)
                    matched_pos.add(range(start, end))
                    logger.debug(f"NER found: {entity}")

    # Step 3: sort all detected entities by their start offset so token
    # numbering follows document order.
    all_entities.sort(key=lambda x: x.start)
    logger.debug(f"All entities sorted by position: {[e.text for e in all_entities]}")

    # Step 4: build the substitution dictionary.
    substitution_dict: Dict[str, str] = {}
    token_counters: Dict[str, int] = {}

    for entity in all_entities:
        if entity.type in enabled_classes:
            # Classes without an explicit transform default to tokenization.
            transform_type = transformations.get(entity.type, "tokenize")

            if transform_type == "tokenize":
                # Reversible pseudonymization: one numbered token per class.
                count = token_counters.get(entity.type, 0)
                substitution_dict[entity.text] = f"<{entity.type}_{count}>"
                token_counters[entity.type] = count + 1
            elif transform_type == "remove":
                substitution_dict[entity.text] = ""
            elif transform_type == "generalize_loc":
                substitution_dict[entity.text] = "[MIASTO WOJEWÓDZKIE]"
            elif transform_type == "generalize":
                if entity.type == "MONEY":
                    substitution_dict.update(generalize_money(entity.text))
                elif entity.type == "DATE":
                    substitution_dict.update(generalize_dates(entity.text))

    if profile == 'gdpr':
        # Extra GDPR pass: generalize dates/amounts the entity step missed.
        temp_text = safe_substitute(text, substitution_dict)
        substitution_dict.update(generalize_dates(temp_text))
        substitution_dict.update(generalize_money(temp_text))

    anonymized_text = safe_substitute(text, substitution_dict)
    # Reverse map only for reversible (<CLASS_n>) tokens.
    token_map = {v: k for k, v in substitution_dict.items() if v.startswith('<')}
    return anonymized_text, token_map

core/normalizer.py

Whitespace-only changes.

core/patterns.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import re
2+
3+
# Regex patterns for identifying sensitive data.

# Stored as a list of tuples to preserve processing order.
# Important: PESEL must come before REGON to avoid a wrong match
# (an 11-digit PESEL would otherwise also satisfy the 9-14 digit REGON run).
PATTERNS = [
    # Land-register (księga wieczysta) number, e.g. "WA4M/00123456/7".
    ("KW", re.compile(r"\b[A-Z]{2}[A-Z0-9]{2}/\d{8}/\d\b")),
    # PESEL national ID: exactly 11 digits (no checksum validation here).
    ("PESEL", re.compile(r"\b\d{11}\b")),
    # NIP tax ID: 10 digits, optionally grouped 3-3-2-2 with dashes/spaces.
    ("NIP", re.compile(r"\b\d{3}[- ]?\d{3}[- ]?\d{2}[- ]?\d{2}\b")),
    # REGON: the regex permissively accepts any 9-14 digit run
    # (real REGONs are 9 or 14 digits).
    ("REGON", re.compile(r"\b\d{9,14}\b")),
    # Simplified pattern for quoted organization names ending in "S.A.".
    ("ORGANIZATION", re.compile(r'"([^"]+ S\.A\.)"', re.IGNORECASE)),
    # Amounts like "521 000,00 zł": space/dot thousands, comma decimals.
    ("MONEY", re.compile(r"(\d{1,3}(?:[ .]\d{3})*(?:,\d{2})?)\s?(z[łl])", re.IGNORECASE)),
    # Numeric dates, e.g. "12.03.2023".
    ("DATE", re.compile(r"\b\d{1,2}\.\d{1,2}\.\d{4}\b")),
    # Verbal dates, e.g. "12 marca 2023 roku".
    ("DATE", re.compile(r"\b\d{1,2} [a-zA-Z]+ \d{4} roku\b", re.IGNORECASE)),
]

core/profile_config.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Anonymization profile definitions.
#
# Each profile declares:
#   "classes"         - entity classes that are anonymized at all;
#   "transformations" - per-class transform name; classes without an entry
#                       fall back to "tokenize" in the anonymizer.
# Known transform names used by the anonymizer: "tokenize", "remove",
# "generalize", "generalize_loc".  NOTE(review): "replace_semantic" appears
# only here (llm-safe) and has no branch in anonymize_text, so those classes
# currently pass through unchanged — confirm whether that is intended.

PROFILES = {
    "pseudonymized": {
        "description": "Pseudonimizacja: Zamiana encji na tagi, z możliwością odwrócenia (generuje mapę).",
        "classes": ["PESEL", "NIP", "REGON", "KW", "PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
        "transformations": {
            # Explicit for clarity; "tokenize" is also the default transform.
            "DATE": "tokenize",
            "MONEY": "tokenize"
        }
    },
    "gdpr": {
        "description": "Anonimizacja RODO: Nieodwracalna generalizacja danych.",
        "classes": ["PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
        "transformations": {
            "DATE": "generalize",
            "MONEY": "generalize",
            "PERSON": "remove",
            "LOCATION": "generalize_loc",
            "ORGANIZATION": "remove"
        }
    },
    "llm-safe": {
        "description": "Profil LLM-Safe: Maksymalna anonimizacja, zastępowanie semantyczne.",
        "classes": ["PESEL", "NIP", "REGON", "KW", "PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
        "transformations": {
            "DATE": "generalize",
            "MONEY": "generalize",
            "PERSON": "replace_semantic",
            "LOCATION": "replace_semantic",
            "ORGANIZATION": "replace_semantic"
        }
    }
}

# Profile used when the caller does not specify one.
DEFAULT_PROFILE = "pseudonymized"

core/transforms.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import re
2+
3+
def generalize_money(text: str) -> dict:
    """Map every money amount found in *text* to a coarse range description.

    Example: ``521 000,00 zł`` -> ``około 520 tys. PLN``.

    Returns:
        Mapping of each matched amount string to its generalized label.
    """
    # Simple amount pattern; can be extended as needed.
    amount_re = re.compile(r"(\d{1,3}(?:[ .]\d{3})*(?:,\d{2})?)\s?(z[łl])", re.IGNORECASE)
    replacements: dict = {}

    for found in amount_re.finditer(text):
        raw = found.group(0)
        # Drop thousand separators (space/dot), then switch the decimal
        # comma to a dot so the value parses as a float.
        normalized = found.group(1).replace(" ", "").replace(".", "").replace(",", ".")
        amount = float(normalized)

        if amount > 1_000_000:
            # Round to the nearest 100k, express in millions.
            millions = round(amount / 100_000) * 100_000 / 1_000_000
            label = f"ponad {millions:.1f} mln PLN"
        elif amount > 10_000:
            # Round to the nearest 10k, express in thousands.
            thousands = round(amount / 10_000) * 10_000 / 1000
            label = f"około {thousands:.0f} tys. PLN"
        else:
            label = "poniżej 10 tys. PLN"

        # Keep the first label computed for a given amount string.
        replacements.setdefault(raw, label)

    return replacements
30+
31+
def generalize_dates(text: str) -> dict:
    """Map numeric dates in *text* to quarter+year generalizations.

    Example: ``12.03.2023`` -> ``Q1 2023``.

    Dates are expected in day-first order (DD.MM.YYYY, also with ``/`` or
    ``-`` separators).  Matches whose month part is outside 1-12 are now
    skipped instead of being silently forced into Q4 (the previous behavior
    classified e.g. the US-style "10.25.2023" as "Q4 2023"), since they are
    not valid day-first dates.

    Returns:
        Mapping of each matched date string to its "Qn YYYY" label.
    """
    pattern = re.compile(r"\b(\d{1,2})[./-](\d{1,2})[./-](\d{4})\b")
    substitution_dict: dict = {}

    for match in pattern.finditer(text):
        original_string = match.group(0)
        month = int(match.group(2))  # day (group 1) is deliberately dropped
        year = int(match.group(3))

        if not 1 <= month <= 12:
            continue  # not a valid day-first date; leave it untouched

        # Months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
        quarter = (month - 1) // 3 + 1
        # Keep the first label computed for a given date string.
        substitution_dict.setdefault(original_string, f"Q{quarter} {year}")

    return substitution_dict

interfaces/cli.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import argparse
2+
from core.anonymizer import anonymize_text
3+
from .file_io import read_file, write_file, save_map_dict
4+
from utils.log import setup_logger
5+
6+
def main():
    """Command-line entry point: read a document, anonymize it, write results."""
    arg_parser = argparse.ArgumentParser(description="Anonymize documents from the command line.")
    arg_parser.add_argument("--input", required=True, help="Path to the source file.")
    arg_parser.add_argument("--output", required=True, help="Path to the output anonymized file.")
    arg_parser.add_argument("--profile", default="pseudonymized",
                            choices=["pseudonymized", "gdpr", "llm-safe"],
                            help="Anonymization profile.")
    # --classes is optional; when given it overrides the profile's class list.
    arg_parser.add_argument("--classes", nargs='+', default=None,
                            help="Custom list of classes to anonymize (overwrites profile setting).")
    args = arg_parser.parse_args()

    # Configure the logger using the input file name.
    logger = setup_logger(args.input)

    # Read the source document; bail out if nothing was read.
    source_text = read_file(args.input)
    if not source_text:
        return

    # Run the anonymization pipeline.
    logger.info(f"Starting anonymization for {args.input} using profile: {args.profile}...")
    result_text, substitution_map = anonymize_text(source_text, args.profile, args.classes)

    # Persist the anonymized text.
    write_file(args.output, result_text)

    # Persist the token map next to the output file, if there is one.
    map_file_path = args.output.rsplit('.', 1)[0] + '_map.json'
    if not substitution_map:
        logger.info("No tokenized entities found to create a map.")
    else:
        save_map_dict(map_file_path, substitution_map)
        logger.info(f"Anonymization complete. Output saved to: {args.output}")
        logger.info(f"Substitution map saved to: {map_file_path}")

if __name__ == "__main__":
    main()

interfaces/file_io.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import json
2+
3+
def read_file(file_path: str) -> str:
    """Read and return the full contents of a UTF-8 text file.

    Returns an empty string (after printing an error) when the file
    does not exist.
    """
    try:
        handle = open(file_path, "r", encoding="utf-8")
    except FileNotFoundError:
        print(f"Błąd: Plik nie został znaleziony: {file_path}")
        return ""
    with handle:
        return handle.read()
11+
12+
def write_file(file_path: str, content: str):
    """Write *content* to a UTF-8 text file, overwriting any existing file."""
    with open(file_path, "w", encoding="utf-8") as out:
        out.write(content)
16+
17+
def save_map_dict(file_path: str, substitution_dict: dict):
    """Persist the substitution map as pretty-printed, non-ASCII-safe JSON."""
    payload = json.dumps(substitution_dict, ensure_ascii=False, indent=2)
    with open(file_path, "w", encoding="utf-8") as out:
        out.write(payload)

0 commit comments

Comments
 (0)