Skip to content

Commit f5fedc7

Browse files
committed
initial main commit
1 parent f764579 commit f5fedc7

26 files changed

+952
-154
lines changed

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,9 @@ env.bak/
6969
venv.bak/
7070

7171
# logs
72-
logs/
72+
logs/
73+
# Pliki generowane przez testy
74+
tests/test_anonymizer_output.py
75+
tests/test_document-big.actual.txt
76+
# Spacy models
77+
models/

core/anonymizer.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,18 @@ class Entity(NamedTuple):
1919
nlp = None
2020

2121
def safe_substitute(text: str, substitution_dict: dict) -> str:
    """Replace every key of ``substitution_dict`` found in ``text`` in one pass.

    All keys are combined into a single alternation and applied with one
    ``re.sub`` call, so text produced by a replacement is never scanned
    again. Chaining ``str.replace`` calls instead lets a later, shorter key
    rewrite part of a token that was just inserted (for example the key
    ``"2"`` matching the digit inside ``__N_2__``).

    Args:
        text: The text to rewrite.
        substitution_dict: Mapping of literal source strings to their
            replacements. Keys are matched literally, not as patterns.
            Empty keys are ignored because they would match at every
            position.

    Returns:
        ``text`` with each occurrence of a key replaced by its value. When
        two keys match at the same position the longer one wins, so
        "Jan Kowalski" is preferred over "Jan".
    """
    import re  # local import: the module's import block is outside this block

    # Longest first: the regex engine tries alternatives left to right, so
    # this ordering makes the longest key win at any given position.
    ordered_keys = sorted(
        (key for key in substitution_dict if key), key=len, reverse=True
    )
    if not ordered_keys:
        return text

    combined = re.compile("|".join(re.escape(key) for key in ordered_keys))
    # A callback (rather than a replacement string) keeps backslashes and
    # group references in the replacement values literal.
    return combined.sub(lambda match: substitution_dict[match.group(0)], text)
2735

2836
def anonymize_text(text: str, profile: str = DEFAULT_PROFILE, custom_classes: list[str] = None) -> tuple[str, dict]:
@@ -40,10 +48,11 @@ def anonymize_text(text: str, profile: str = DEFAULT_PROFILE, custom_classes: li
4048
for match in pattern.finditer(text):
4149
start, end = match.span()
4250
# Sprawdź, czy zakres się nie nakłada
43-
if not any(start in r or end - 1 in r for r in matched_pos):
51+
current_range = range(start, end)
52+
if not any(max(r.start, current_range.start) < min(r.stop, current_range.stop) for r in matched_pos):
4453
entity = Entity(match.group(0), class_name, start, end)
4554
all_entities.append(entity)
46-
matched_pos.add(range(start, end))
55+
matched_pos.add(current_range)
4756
logger.debug(f"Regex found: {entity}")
4857

4958
# Krok 2: Znajdź encje z NER, ignorując te, które nakładają się na regex
@@ -75,7 +84,7 @@ def anonymize_text(text: str, profile: str = DEFAULT_PROFILE, custom_classes: li
7584

7685
if transform_type == "tokenize":
7786
count = token_counters.get(entity.type, 0)
78-
substitution_dict[entity.text] = f"<{entity.type}_{count}>"
87+
substitution_dict[entity.text] = f"__{entity.type}_{count}__"
7988
token_counters[entity.type] = count + 1
8089
elif transform_type == "remove":
8190
substitution_dict[entity.text] = ""
@@ -94,5 +103,5 @@ def anonymize_text(text: str, profile: str = DEFAULT_PROFILE, custom_classes: li
94103
substitution_dict.update(generalize_money(temp_text))
95104

96105
anonymized_text = safe_substitute(text, substitution_dict)
97-
token_map = {v: k for k, v in substitution_dict.items() if v.startswith('<')}
106+
token_map = {v: k for k, v in substitution_dict.items() if v.startswith('__') and v.endswith('__')}
98107
return anonymized_text, token_map

core/patterns.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,60 @@
1111
("REGON", re.compile(r"\b\d{9,14}\b")),
1212
# Poprawiony, prostszy wzorzec dla organizacji w cudzysłowach
1313
("ORGANIZATION", re.compile(r'"([^"]+ S\.A\.)"', re.IGNORECASE)),
14+
# Wzorzec dla kwot słownych, np. "sto tysięcy złotych"
15+
("MONEY_TEXT", re.compile(r"""
16+
\b(
17+
(?:
18+
(?:jeden|dwa|trzy|cztery|pięć|sześć|siedem|osiem|dziewięć|dziesięć|
19+
jedenaście|dwanaście|trzynaście|czternaście|piętnaście|szesnaście|siedemnaście|osiemnaście|dziewiętnaście|
20+
dwadzieścia|trzydzieści|czterdzieści|pięćdziesiąt|sześćdziesiąt|siedemdziesiąt|osiemdziesiąt|dziewięćdziesiąt|
21+
sto|dwieście|trzysta|czterysta|pięćset|sześćset|siedemset|osiemset|dziewięćset|
22+
tysiąc|tysiące|tysięcy|milion|miliony|milionów|miliard|miliardy|miliardów)
23+
[\s,-]*
24+
)+
25+
)
26+
\s+(?:złotych|złote|złoty|pln|zł)\b
27+
""", re.IGNORECASE | re.VERBOSE)),
1428
("MONEY", re.compile(r"(\d{1,3}(?:[ .]\d{3})*(?:,\d{2})?)\s?(z[łl])", re.IGNORECASE)),
29+
# Wzorce dla powierzchni. Najpierw bardziej szczegółowy (z opisem), potem ogólny.
30+
# To zapobiega sytuacji, w której ogólny wzorzec dopasowuje tylko część dłuższego wyrażenia.
31+
("AREA", re.compile(r"""
32+
\b
33+
(?:ok\.\s*)?
34+
(?:
35+
(?:
36+
(?:\d{1,3}(?:[ .,]\d{3})*|\d+)(?:[,.]\d+)?\s*(?:ha|a|m²|m|cm²|cm|mm|km²)
37+
)
38+
(?:
39+
\s*(?:x\s*)?(?:\d{1,3}(?:[ .,]\d{3})*|\d+)(?:[,.]\d+)?\s*(?:a|m²|m|cm²|cm|mm|km²)
40+
){0,2}
41+
)
42+
\s*\([^)]+\) # Opis w nawiasie jest tutaj obowiązkowy
43+
""", re.IGNORECASE | re.VERBOSE)),
44+
("AREA", re.compile(r"""
45+
\b
46+
(?:ok\.\s*)?
47+
(?:
48+
(?:
49+
(?:\d{1,3}(?:[ .,]\d{3})*|\d+)(?:[,.]\d+)?\s*(?:ha|a|m²|m|cm²|cm|mm|km²)
50+
)
51+
(?:
52+
\s*(?:x\s*)?(?:\d{1,3}(?:[ .,]\d{3})*|\d+)(?:[,.]\d+)?\s*(?:a|m²|m|cm²|cm|mm|km²)
53+
){0,2}
54+
)
55+
\b
56+
""", re.IGNORECASE | re.VERBOSE)),
1557
("DATE", re.compile(r"\b\d{1,2}\.\d{1,2}\.\d{4}\b")),
1658
("DATE", re.compile(r"\b\d{1,2} [a-zA-Z]+ \d{4} roku\b", re.IGNORECASE)),
59+
("POST_CODE", re.compile(r"\b\d{2}-\d{3}\b")),
60+
("STREET_ADDRESS", re.compile(r"""
61+
\b(
62+
(?:ul|al|pl|os)\.?\s+
63+
(?:[A-ZŻŹĆĄŚĘŁÓŃa-zżźćńółęąś-]+\s+)+
64+
\d{1,4}
65+
(?:[a-zA-Z])?
66+
(?:/\d{1,4})?
67+
)\b
68+
""", re.IGNORECASE | re.VERBOSE)),
69+
("LAND_PLOT", re.compile(r"\b\d+\s*/\s*\d+\b")),
1770
]

core/profile_config.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,45 @@
33
PROFILES = {
44
"pseudonymized": {
55
"description": "Pseudonimizacja: Zamiana encji na tagi, z możliwością odwrócenia (generuje mapę).",
6-
"classes": ["PESEL", "NIP", "REGON", "KW", "PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
6+
"classes": [
7+
"PESEL", "NIP", "REGON", "KW", "PERSON", "LOCATION", "ORGANIZATION",
8+
"MONEY", "MONEY_TEXT", "DATE", "POST_CODE", "STREET_ADDRESS", "LAND_PLOT", "AREA"
9+
],
710
"transformations": {
811
"DATE": "tokenize",
9-
"MONEY": "tokenize"
12+
"POST_CODE": "tokenize",
13+
"STREET_ADDRESS": "tokenize",
14+
"LAND_PLOT": "tokenize",
15+
"MONEY": "tokenize",
16+
"MONEY_TEXT": "tokenize",
17+
"AREA": "tokenize"
1018
}
1119
},
1220
"gdpr": {
1321
"description": "Anonimizacja RODO: Nieodwracalna generalizacja danych.",
14-
"classes": ["PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
22+
"classes": ["PERSON", "LOCATION", "ORGANIZATION", "MONEY", "MONEY_TEXT", "DATE", "POST_CODE", "STREET_ADDRESS", "LAND_PLOT"],
1523
"transformations": {
1624
"DATE": "generalize",
25+
"POST_CODE": "remove",
26+
"STREET_ADDRESS": "remove",
27+
"LAND_PLOT": "remove",
1728
"MONEY": "generalize",
29+
"MONEY_TEXT": "generalize",
1830
"PERSON": "remove",
1931
"LOCATION": "generalize_loc",
2032
"ORGANIZATION": "remove"
2133
}
2234
},
2335
"llm-safe": {
2436
"description": "Profil LLM-Safe: Maksymalna anonimizacja, zastępowanie semantyczne.",
25-
"classes": ["PESEL", "NIP", "REGON", "KW", "PERSON", "LOCATION", "ORGANIZATION", "MONEY", "DATE"],
37+
"classes": ["PESEL", "NIP", "REGON", "KW", "PERSON", "LOCATION", "ORGANIZATION", "MONEY", "MONEY_TEXT", "DATE", "POST_CODE", "STREET_ADDRESS", "LAND_PLOT"],
2638
"transformations": {
2739
"DATE": "generalize",
40+
"POST_CODE": "tokenize",
41+
"STREET_ADDRESS": "tokenize",
42+
"LAND_PLOT": "tokenize",
2843
"MONEY": "generalize",
44+
"MONEY_TEXT": "generalize",
2945
"PERSON": "replace_semantic",
3046
"LOCATION": "replace_semantic",
3147
"ORGANIZATION": "replace_semantic"

interfaces/__init__.py

Whitespace-only changes.

interfaces/cli.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import argparse
2+
import os
23
from core.anonymizer import anonymize_text
34
from .file_io import read_file, write_file, save_map_dict
45
from utils.log import setup_logger
56

67
def main():
78
parser = argparse.ArgumentParser(description="Anonymize documents from the command line.")
8-
parser.add_argument("--input", required=True, help="Path to the source file.")
9-
parser.add_argument("--output", required=True, help="Path to the output anonymized file.")
9+
parser.add_argument("-i", "--input", required=True, help="Path to the source file.")
10+
parser.add_argument("-o", "--output", help="Path to the output anonymized file. Defaults to <input>.anon.<ext>")
1011
parser.add_argument("--profile", default="pseudonymized", choices=["pseudonymized", "gdpr", "llm-safe"],
1112
help="Anonymization profile.")
1213

@@ -16,6 +17,11 @@ def main():
1617

1718
args = parser.parse_args()
1819

20+
# Jeśli plik wyjściowy nie jest podany, utwórz go na podstawie nazwy pliku wejściowego
21+
if not args.output:
22+
base, ext = os.path.splitext(args.input)
23+
args.output = f"{base}.anon{ext}"
24+
1925
# Skonfiguruj logger z nazwą pliku wejściowego
2026
logger = setup_logger(args.input)
2127

interfaces/gradio_ui.py

Lines changed: 111 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,133 @@
1+
import os
12
import gradio as gr
23
from core.anonymizer import anonymize_text
34
import tempfile
45
import json
5-
66
from core.profile_config import PROFILES
77

8-
def anonymize_interface(file_obj, profile):
9-
if file_obj is None:
10-
return "Proszę wgrać plik.", "Brak wyników.", None
8+
# Wyłącz analytics
9+
os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
10+
os.environ['HF_HUB_OFFLINE'] = '1'
1111

12-
original_text = file_obj.decode('utf-8')
12+
class AnonymizerInterface:
    """Static helpers that connect the Gradio UI to the anonymizer."""

    @staticmethod
    def create_temp_file(content, filename, extension):
        """Write ``content`` to a persistent temporary file and return its path.

        Args:
            content: Data to store. It is serialised with ``json.dump`` when
                ``extension`` is ``".json"`` and written as text otherwise.
            filename: Prefix for the temporary file name; an underscore is
                appended before the random part.
            extension: File suffix including the leading dot.

        Returns:
            The path of the created file, or ``None`` when ``content`` is
            empty or otherwise falsy (no file is created in that case).
        """
        if not content:
            return None

        # delete=False: the file has to outlive this call so that the path
        # can be handed to a download component.
        with tempfile.NamedTemporaryFile(
            delete=False,
            mode="w",
            suffix=extension,
            prefix=filename + "_",
            encoding="utf-8",
        ) as tmp:
            if extension == ".json":
                json.dump(content, tmp, ensure_ascii=False, indent=2)
            else:
                tmp.write(content)
        return tmp.name

    @staticmethod
    def process_file(file_obj, profile):
        """Anonymize an uploaded file and prepare the downloadable results.

        Args:
            file_obj: Raw bytes of the uploaded file, or ``None`` when
                nothing was uploaded.
            profile: Name of the anonymization profile passed on to
                ``anonymize_text``.

        Returns:
            A 4-tuple ``(anonymized_text, substitution_map, map_file_path,
            text_file_path)``. When there is no upload, or the upload is not
            valid UTF-8, the first two items are user-facing messages and
            both paths are ``None``.
        """
        if file_obj is None:
            return "Proszę wgrać plik.", "Brak wyników.", None, None

        try:
            # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading
            # byte-order mark, which would otherwise end up in the output.
            original_text = file_obj.decode("utf-8-sig")
        except UnicodeDecodeError:
            return (
                "Nie można odczytać pliku – wymagane kodowanie UTF-8.",
                "Brak wyników.",
                None,
                None,
            )

        anonymized_text, substitution_map = anonymize_text(original_text, profile)

        # Files offered for download
        map_file_path = AnonymizerInterface.create_temp_file(
            substitution_map,
            "mapowania",
            ".json",
        )
        text_file_path = AnonymizerInterface.create_temp_file(
            anonymized_text,
            "tekst_anonimizowany",
            ".txt",
        )

        return anonymized_text, substitution_map, map_file_path, text_file_path
2557

26-
def launch():
27-
with gr.Blocks() as demo:
28-
gr.Markdown("# Anonimizator Umów Notarialnych")
58+
def create_ui():
59+
"""Tworzy interfejs użytkownika"""
60+
with gr.Blocks(
61+
css="""
62+
* {
63+
font-family: system-ui, -apple-system, BlinkMacSystemFont, sans-serif !important;
64+
}
65+
link[rel="manifest"] { display: none !important; }
66+
""",
67+
theme=gr.themes.Soft()
68+
) as demo:
2969

70+
gr.Markdown("# Janusz Danych Rodo - Anonimizator Umów Notarialnych")
71+
72+
# Wiersz 1: Wgranie pliku, profil, przycisk
73+
with gr.Row():
74+
file_input = gr.File(
75+
label="Wgraj plik (.txt, .md)",
76+
type="binary",
77+
file_count="single"
78+
)
79+
profile_dropdown = gr.Dropdown(
80+
choices=list(PROFILES.keys()),
81+
value="pseudonymized",
82+
label="Profil Anonimizacji"
83+
)
84+
submit_btn = gr.Button("Anonimizuj")
85+
86+
# Wiersz 2: Tekst zanonimizowany
87+
with gr.Row():
88+
output_text = gr.Textbox(
89+
label="Tekst zanonimizowany",
90+
lines=15,
91+
interactive=True
92+
)
93+
94+
# Wiersz 3: Słownik mapowań i pobieranie
3095
with gr.Row():
3196
with gr.Column():
32-
file_input = gr.File(label="Wgraj plik (.txt, .md)", type="binary")
33-
profile_dropdown = gr.Dropdown(
34-
choices=list(PROFILES.keys()),
35-
value="pseudonymized",
36-
label="Profil Anonimizacji"
37-
)
38-
submit_btn = gr.Button("Anonimizuj")
97+
output_map = gr.JSON(label="Słownik mapowań")
3998

4099
with gr.Column():
41-
output_text = gr.Textbox(label="Tekst zanonimizowany", lines=15)
42-
output_map = gr.JSON(label="Słownik mapowań")
43-
download_map_btn = gr.File(label="Pobierz słownik mapowań")
44-
100+
gr.Markdown("### Pobierz pliki")
101+
with gr.Group():
102+
download_map_btn = gr.File(label="Słownik mapowań")
103+
download_text_btn = gr.File(label="Tekst anonimizowany")
104+
45105
submit_btn.click(
46-
fn=anonymize_interface,
106+
fn=AnonymizerInterface.process_file,
47107
inputs=[file_input, profile_dropdown],
48-
outputs=[output_text, output_map, download_map_btn]
108+
outputs=[output_text, output_map, download_map_btn, download_text_btn]
49109
)
110+
111+
return demo
112+
113+
def launch():
    """Print startup diagnostics, then build the UI and serve it locally."""
    # Each tuple is printed as one line, in this order.
    startup_report = (
        ("Working directory:", os.getcwd()),
        ("Static dir exists:", os.path.exists("static")),
        ("Manifest exists:", os.path.exists("static/manifest.json")),
        (gr.__version__,),
        (hasattr(gr, 'set_static_paths'),),
    )
    for fields in startup_report:
        print(*fields)

    server_options = {
        "share": False,
        "server_name": "127.0.0.1",
        "server_port": 7860,
        "inbrowser": True,
        "quiet": True,
        "show_error": True,
        "favicon_path": None,
    }
    create_ui().launch(**server_options)
52131

53132
if __name__ == "__main__":
54133
launch()

0 commit comments

Comments
 (0)