Skip to content

Commit 112de37

Browse files
committed
uprava vyberu nejhorsiho kriteria
1 parent 586ed9c commit 112de37

File tree

3 files changed

+366
-220
lines changed

3 files changed

+366
-220
lines changed

kod/development.ipynb

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": 2,
66
"id": "c509aba7",
77
"metadata": {},
88
"outputs": [],
@@ -29,8 +29,8 @@
2929
"from processing import *\n",
3030
"\n",
3131
"pl.Config.set_tbl_cols(-1)\n",
32-
"# os.chdir(r'E:\\CVUT_BAP')\n",
33-
"os.chdir(r'C:\\Users\\adamp\\Projects\\CVUT_BAP')"
32+
"os.chdir(r'E:\\CVUT_BAP')\n",
33+
"# os.chdir(r'C:\\Users\\adamp\\Projects\\CVUT_BAP')"
3434
]
3535
},
3636
{
@@ -2142,11 +2142,48 @@
21422142
"source": [
21432143
"df['Obd_KomunikacniProtokol'].value_counts().sort(by='count')"
21442144
]
2145+
},
2146+
{
2147+
"cell_type": "markdown",
2148+
"id": "0b906215",
2149+
"metadata": {},
2150+
"source": [
2151+
"# Analyza vzorku dat pro dalsi rozhodovani"
2152+
]
2153+
},
2154+
{
2155+
"cell_type": "code",
2156+
"execution_count": 3,
2157+
"id": "d3894581",
2158+
"metadata": {},
2159+
"outputs": [
2160+
{
2161+
"ename": "FileNotFoundError",
2162+
"evalue": "The system cannot find the path specified. (os error 3): kod/data/data_z_mericich_pristroju/parquet/nafta_osobni",
2163+
"output_type": "error",
2164+
"traceback": [
2165+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
2166+
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
2167+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m df_mereni = cast_mereni(\u001b[43mpl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mkod/data/data_z_mericich_pristroju/parquet/nafta_osobni\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mschema\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnafta_schema\u001b[49m\u001b[43m)\u001b[49m).sample(fraction=\u001b[32m1.0\u001b[39m, shuffle=\u001b[38;5;28;01mTrue\u001b[39;00m, seed=SEED)\n\u001b[32m 2\u001b[39m describe(df_mereni)\n",
2168+
"\u001b[36mFile \u001b[39m\u001b[32me:\\CVUT_BAP\\.venv\\Lib\\site-packages\\polars\\_utils\\deprecation.py:128\u001b[39m, in \u001b[36mdeprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 123\u001b[39m \u001b[38;5;129m@wraps\u001b[39m(function)\n\u001b[32m 124\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args: P.args, **kwargs: P.kwargs) -> T:\n\u001b[32m 125\u001b[39m _rename_keyword_argument(\n\u001b[32m 126\u001b[39m old_name, new_name, kwargs, function.\u001b[34m__qualname__\u001b[39m, version\n\u001b[32m 127\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m128\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
2169+
"\u001b[36mFile \u001b[39m\u001b[32me:\\CVUT_BAP\\.venv\\Lib\\site-packages\\polars\\_utils\\deprecation.py:128\u001b[39m, in \u001b[36mdeprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 123\u001b[39m \u001b[38;5;129m@wraps\u001b[39m(function)\n\u001b[32m 124\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args: P.args, **kwargs: P.kwargs) -> T:\n\u001b[32m 125\u001b[39m _rename_keyword_argument(\n\u001b[32m 126\u001b[39m old_name, new_name, kwargs, function.\u001b[34m__qualname__\u001b[39m, version\n\u001b[32m 127\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m128\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
2170+
"\u001b[36mFile \u001b[39m\u001b[32me:\\CVUT_BAP\\.venv\\Lib\\site-packages\\polars\\io\\parquet\\functions.py:289\u001b[39m, in \u001b[36mread_parquet\u001b[39m\u001b[34m(source, columns, n_rows, row_index_name, row_index_offset, parallel, use_statistics, hive_partitioning, glob, schema, hive_schema, try_parse_hive_dates, rechunk, low_memory, storage_options, credential_provider, retries, use_pyarrow, pyarrow_options, memory_map, include_file_paths, missing_columns, allow_missing_columns)\u001b[39m\n\u001b[32m 286\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 287\u001b[39m lf = lf.select(columns)\n\u001b[32m--> \u001b[39m\u001b[32m289\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
2171+
"\u001b[36mFile \u001b[39m\u001b[32me:\\CVUT_BAP\\.venv\\Lib\\site-packages\\polars\\_utils\\deprecation.py:97\u001b[39m, in \u001b[36mdeprecate_streaming_parameter.<locals>.decorate.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 93\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mengine\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[33m\"\u001b[39m\u001b[33min-memory\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mdel\u001b[39;00m kwargs[\u001b[33m\"\u001b[39m\u001b[33mstreaming\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m---> \u001b[39m\u001b[32m97\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
2172+
"\u001b[36mFile \u001b[39m\u001b[32me:\\CVUT_BAP\\.venv\\Lib\\site-packages\\polars\\lazyframe\\opt_flags.py:328\u001b[39m, in \u001b[36mforward_old_opt_flags.<locals>.decorate.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 325\u001b[39m optflags = cb(optflags, kwargs.pop(key)) \u001b[38;5;66;03m# type: ignore[no-untyped-call,unused-ignore]\u001b[39;00m\n\u001b[32m 327\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33moptimizations\u001b[39m\u001b[33m\"\u001b[39m] = optflags\n\u001b[32m--> \u001b[39m\u001b[32m328\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
2173+
"\u001b[36mFile \u001b[39m\u001b[32me:\\CVUT_BAP\\.venv\\Lib\\site-packages\\polars\\lazyframe\\frame.py:2429\u001b[39m, in \u001b[36mLazyFrame.collect\u001b[39m\u001b[34m(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, engine, background, optimizations, **_kwargs)\u001b[39m\n\u001b[32m 2427\u001b[39m \u001b[38;5;66;03m# Only for testing purposes\u001b[39;00m\n\u001b[32m 2428\u001b[39m callback = _kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mpost_opt_callback\u001b[39m\u001b[33m\"\u001b[39m, callback)\n\u001b[32m-> \u001b[39m\u001b[32m2429\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m wrap_df(\u001b[43mldf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallback\u001b[49m\u001b[43m)\u001b[49m)\n",
2174+
"\u001b[31mFileNotFoundError\u001b[39m: The system cannot find the path specified. (os error 3): kod/data/data_z_mericich_pristroju/parquet/nafta_osobni"
2175+
]
2176+
}
2177+
],
2178+
"source": [
2179+
"df_mereni = cast_mereni(pl.read_parquet('kod/data/data_z_mericich_pristroju/parquet/nafta_osobni', schema=nafta_schema)).sample(fraction=1.0, shuffle=True, seed=SEED)\n",
2180+
"describe(df_mereni)"
2181+
]
21452182
}
21462183
],
21472184
"metadata": {
21482185
"kernelspec": {
2149-
"display_name": ".venv",
2186+
"display_name": ".venv (3.13.7)",
21502187
"language": "python",
21512188
"name": "python3"
21522189
},

kod/preprocess.py

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import gzip
33
import shutil
44
from functools import partial
5-
from statistics import mean
65

76
import pandas as pd
87
import pyarrow as pa
@@ -490,15 +489,15 @@ def fill_result_list(vyusteni_element_list, result_lists, required_list, categor
490489

491490

492491
# Vybere z každého vyústění hodnotu, která je považována za nejhorší
493-
def select_worst(result_lists, strategy_dict):
492+
def select_worst(result_lists, strategy_dict, already_parsed):
494493
result = {}
495494
for name, result_list in result_lists.items():
496495
try:
497496
if not result_list:
498497
result[name] = None
499498
continue
500499
if 'Min' in name or 'Max' in name or 'Vysledek' in name:
501-
result[name] = next((result for result in result_list if result is not None), None)
500+
result[name] = next((val for val in result_list if val is not None), None)
502501
continue
503502
strategy = strategy_dict[name.split('_')[-2]]
504503
floats = floats_sublist(result_list)
@@ -508,20 +507,25 @@ def select_worst(result_lists, strategy_dict):
508507
float_result = max(floats, default=None)
509508
case 'min':
510509
float_result = min(floats, default=None)
511-
case 'mean':
512-
float_result = mean(floats) # Vyvolá výjimku v případě prázdného seznamu
513510
case 'max_diff_1':
514511
float_result = max(floats, default=None, key=lambda x: abs(x - 1.0))
515512
case 'bounds':
516-
name_stem = name.partition("_Hodnota")[0]
517513
try:
518-
min_value = float(result[f'{name_stem}_Min_Hodnota'])
519-
max_value = float(result[f'{name_stem}_Max_Hodnota'])
514+
if name.startswith('Nafta'):
515+
# Pro naftu jsou limity v samostatném bloku MereniVznetLimit
516+
param = name.split('_')[-2]
517+
min_value = float(already_parsed[f'Nafta_MereniVznetLimit_{param}_Min_Hodnota'])
518+
max_value = float(already_parsed[f'Nafta_MereniVznetLimit_{param}_Max_Hodnota'])
519+
else:
520+
# Pro benzín/plyn jsou limity součástí aktuálního záznamu
521+
name_stem = name.partition("_Hodnota")[0]
522+
min_value = float(result[f'{name_stem}_Min_Hodnota'])
523+
max_value = float(result[f'{name_stem}_Max_Hodnota'])
524+
520525
optimal_value = (max_value + min_value) / 2
521526
float_result = max(floats, default=None, key=lambda x: abs(x - optimal_value))
522-
# Pokud by některá z krajních hodnot chyběla vezmu první záznam o otáčkách
523527
except Exception:
524-
float_result = next((float for float in floats if float is not None), None)
528+
float_result = next((f for f in floats if f is not None), None)
525529
# Cast na string, aby bylo zachováno načtení všech hodnot jako string
526530
if float_result is not None:
527531
result[name] = str(float_result)
@@ -543,8 +547,8 @@ def get_detail_benzin(element, prefix, namespaces):
543547
categories = {'OtackyVolnobezne': ('otackyVolnobezne', 1), 'OtackyZvysene': ('otackyZvysene', 1)}
544548
result_lists = initialize_result_list(required_list, categories, prefix)
545549
fill_result_list(benzin_vyusteni_element_list, result_lists, required_list, categories, prefix, namespaces)
546-
strategy_dict = {'CO': 'max', 'CO2': 'min', 'COCOOR': 'max', 'HC': 'max', 'LAMBDA': 'max_diff_1', 'N': 'bounds', 'NOX': 'max', 'O2': 'max', 'TPS': 'max'}
547-
result |= select_worst(result_lists, strategy_dict)
550+
strategy_dict = {'CO': 'max', 'CO2': 'min', 'COCOOR': 'max', 'HC': 'max', 'LAMBDA': 'max_diff_1', 'N': 'bounds', 'NOX': 'max', 'O2': 'max', 'TPS': 'min'}
551+
result |= select_worst(result_lists, strategy_dict, result)
548552
return result
549553

550554

@@ -575,8 +579,8 @@ def get_detail_nafta(element, prefix, namespaces):
575579
categories = {'MereniPrumer': ('mereniPrumer', 1), 'Mereni': ('mereni', 4)}
576580
result_lists = initialize_result_list(required_list, categories, prefix)
577581
fill_result_list(nafta_vyusteni_element_list, result_lists, required_list, categories, prefix, namespaces)
578-
strategy_dict = {'TPS': 'mean', 'CasAkcelerace': 'max', 'Kourivost': 'max', 'OtackyPrebehove': 'mean', 'OtackyVolnobezne': 'mean', 'Teplota': 'min'}
579-
result |= select_worst(result_lists, strategy_dict)
582+
strategy_dict = {'TPS': 'min', 'CasAkcelerace': 'max', 'Kourivost': 'max', 'OtackyPrebehove': 'bounds', 'OtackyVolnobezne': 'bounds', 'Teplota': 'min'}
583+
result |= select_worst(result_lists, strategy_dict, result)
580584
return result
581585

582586

@@ -980,6 +984,13 @@ def parse_stations_file(target_dir, xml_file, verbosity, delete):
980984
def run_preprocessing():
981985
explain_verbosity(config.VERBOSITY)
982986

987+
print('—————————————————————————————————Stanice STK a SME:—————————————————————————————————————————————\n')
988+
# Seznam stanic prochází denní aktualizací
989+
clear_folder(config.STATIONS_DIR, config.VERBOSITY)
990+
download_stations(config.SPARQL_ENDPOINT, config.STATIONS_DIR / 'gz', config.DATASET_STATIONS, config.VERBOSITY)
991+
extract_files(config.STATIONS_DIR / 'gz', config.STATIONS_DIR / 'xml', 1, config.VERBOSITY)
992+
parse_series_to_parquet(config.STATIONS_DIR / 'xml', config.STATIONS_DIR / 'parquet', parse_stations_file, 1, config.VERBOSITY, False)
993+
983994
print('——————————————————————————————————PROHLÍDKY VOZIDEL STK A SME:——————————————————————————————————\n')
984995
downloaded_inspection_dates = downloaded_dates([config.INSPECTIONS_DIR / 'gz', config.INSPECTIONS_DIR / 'xml', config.INSPECTIONS_DIR / 'parquet'])
985996
download_files(config.SPARQL_ENDPOINT, config.INSPECTIONS_DIR / 'gz', config.PARENT_DATASET_INSPECTIONS, config.START_DATE, config.END_DATE, downloaded_inspection_dates, config.NO_DOWNLOAD_THREADS, config.MAX_DOWNLOAD_ATTEMPTS, config.VERBOSITY)
@@ -992,12 +1003,6 @@ def run_preprocessing():
9921003
extract_files(config.MEASUREMENTS_DIR / 'gz', config.MEASUREMENTS_DIR / 'xml', config.NO_EXTRACT_THREADS, config.VERBOSITY)
9931004
parse_series_to_parquet(config.MEASUREMENTS_DIR / 'xml', config.MEASUREMENTS_DIR / 'parquet', parse_measurements_file, config.NO_PARSE_PROCESSES, config.VERBOSITY, False)
9941005

995-
print('—————————————————————————————————Stanice STK a SME:—————————————————————————————————————————————\n')
996-
# Seznam stanic prochází denní aktualizací
997-
clear_folder(config.STATIONS_DIR, config.VERBOSITY)
998-
download_stations(config.SPARQL_ENDPOINT, config.STATIONS_DIR / 'gz', config.DATASET_STATIONS, config.VERBOSITY)
999-
extract_files(config.STATIONS_DIR / 'gz', config.STATIONS_DIR / 'xml', 1, config.VERBOSITY)
1000-
parse_series_to_parquet(config.STATIONS_DIR / 'xml', config.STATIONS_DIR / 'parquet', parse_stations_file, 1, config.VERBOSITY, False)
10011006

10021007

10031008
if __name__ == '__main__':

0 commit comments

Comments
 (0)