@@ -5,7 +5,6 @@ setDTthreads(0)
# Load data
# Read the combined financials from the intermediate fst file as a data.table.
russian_financials <- read_fst(
  "temp/combined_financials_impFrNY_negLinesCorr.fst",
  as.data.table = TRUE
)
# Start every row with adj_any = 0 (presumably a flag that later steps set
# when a row is adjusted -- confirm against impute()).
russian_financials[, adj_any := 0]
8- # russian_financials[inn == "0100000621", .SD, .SDcols = patterns("line_1")]
98
109# Declare imputation function
1110impute <- function (dt , imp_target , lines_to_sum , flag_imputation = T ) {
@@ -26,7 +25,6 @@ impute <- function(dt, imp_target, lines_to_sum, flag_imputation = T) {
2625 dt [, imp_value : = NULL , env = env ]
2726 dt [, useless : = NULL ]
2827 dt [, orig_value : = NULL ]
29- # return(dt)
3028}
3129
3230# Function demo:
@@ -52,9 +50,11 @@ XXX0_lines_for_simple_imp <- c("line_3210", "line_3220",
5250
# For every XXX0 target, collect the columns that share the target's name
# minus its last character and end in 1-9 or "x", then hand them to impute().
# The result is not assigned, so this relies on impute() updating
# russian_financials by reference.
for (imp_target in XXX0_lines_for_simple_imp) {
  # e.g. "line_1230" > "line_123[1-9x]"
  regex <- paste0(stringi::stri_sub(imp_target, 1, -2), "[1-9x]")
  # print(regex)
  lines_to_sum <- grep(regex, names(russian_financials), value = TRUE)
  # message(imp_target, " = ", paste(lines_to_sum, collapse = " + "))
  impute(russian_financials, imp_target, lines_to_sum)
}
5858
5959# Higher level imputation: sum of XX[1-9]0 lines -----------------------------
6060XX00_lines_for_simple_imp <- c(" line_1100" , " line_1200" ,
@@ -64,6 +64,14 @@ XX00_lines_for_simple_imp <- c("line_1100", "line_1200",
6464
# For every XX00 target, collect the XX[1-9]0 columns that share the target's
# name minus its last two characters, then hand them to impute().
# The result is not assigned, so this relies on impute() updating
# russian_financials by reference.
for (imp_target in XX00_lines_for_simple_imp) {
  # e.g. "line_1200" > "line_12[1-9]0"
  regex <- paste0(stringi::stri_sub(imp_target, 1, -3), "[1-9]0")
  lines_to_sum <- grep(regex, names(russian_financials), value = TRUE)
  # message(imp_target, " = ", paste(lines_to_sum, collapse = " + "))
  impute(russian_financials, imp_target, lines_to_sum)
}
71+ # russian_financials[inn == "0100001343", .SD, .SDcols = patterns("year|imp|line_1")]
72+ # russian_financials_simple[inn == "0100001343", .SD, .SDcols = patterns("year|imp|line_1")]
73+ # russian_financials[inn == "0100000621", .SD, .SDcols = patterns("year|imp|line_1")]
74+ # russian_financials_full[inn == "0100000621", .SD, .SDcols = patterns("year|imp|line_1")]
6775
# Imputation using formulas common to simplified and non-simplified statements ---------
# Negated copy of line_6300 (presumably so it can enter an impute() sum as a
# subtracted component -- confirm against the calls that follow).
russian_financials[, line_6300_neg := -line_6300]
# Build a negated helper column "<line>_neg" for each listed line of the full
# statements; `env` substitutes the column names into the j-expression.
for (l in c("line_2120", "line_2210", "line_2220", "line_2330", "line_2350",
            "line_3220", "line_3320", "line_4120", "line_4220", "line_4320")) {
  russian_financials_full[, l_neg := -l,
                          env = list(l = l, l_neg = paste0(l, "_neg"))]
}
### Impute
# Each call passes a target line and its component lines; "_neg" components
# are the negated helper columns created above.
impute(russian_financials_full, "line_1600", c("line_1100", "line_1200"))
impute(russian_financials_full, "line_1700",
       c("line_1300", "line_1400", "line_1500"))
impute(russian_financials_full, "line_2100", c("line_2110", "line_2120_neg"))
impute(russian_financials_full, "line_2200",
       c("line_2100", "line_2210_neg", "line_2220_neg"))
impute(russian_financials_full, "line_2300",
       c("line_2200", "line_2310", "line_2320", "line_2330_neg",
         "line_2340", "line_2350_neg"))
9099
91-
92-
# Impute 24XX
# Negated copy of line_2411, used as a component of line_2410 below.
russian_financials_full[, line_2411_neg := -line_2411]
# line_2410 from (-line_2411) and line_2412.
impute(russian_financials_full, "line_2410", c("line_2411_neg", "line_2412"))
# line_2400 from line_2300, line_2410 and line_2460.
impute(russian_financials_full, "line_2400",
       c("line_2300", "line_2410", "line_2460"))
104+
105+ # # Check
106+ # russian_financials_full[inn == "7703443256", .(year, line_2400, line_2300, line_2410, line_2411, line_2460)]
107+ # russian_financials_full[year >= 2019][inn %in% sample(inn, 5), .(inn, year, line_2400, line_2300, line_2410, line_2411, line_2460)]
97108
98109# # Construct 24XX lines with the same meaning across different periods
99110russian_financials_full [, line_2410_uniform_tax : = NA_real_ ]
@@ -149,8 +160,11 @@ impute(russian_financials_full, "line_4400", c("line_4100", "line_4200", "line_4
impute(russian_financials_full, "line_4500",
       c("line_4400", "line_4450", "line_4490"))

## Simplified statements
### Prepare some lines
# Build a negated helper column "<line>_neg" for each listed line of the
# simplified statements; `env` substitutes the column names into j.
for (l in c("line_2120", "line_2330", "line_2350", "line_2410")) {
  russian_financials_simple[, l_neg := -l,
                            env = list(l = l, l_neg = paste0(l, "_neg"))]
}
### Impute
# Each call passes a target line and its component lines to impute().
impute(russian_financials_simple, "line_1600",
       c("line_1150", "line_1170", "line_1210", "line_1250", "line_1230"))
impute(russian_financials_simple, "line_1700",
       c("line_1300", "line_1350", "line_1360", "line_1410", "line_1450",
         "line_1510", "line_1520", "line_1550"))
impute(russian_financials_simple, "line_2200", c("line_2110", "line_2120_neg"))
@@ -165,12 +179,13 @@ russian_financials <- rbindlist(list(russian_financials_full, russian_financials
# Drop the temporary negated helper columns: every column whose name contains
# a digit immediately followed by "_neg".
lines_to_delete <- grep("\\d_neg", names(russian_financials), value = TRUE)
russian_financials[, (lines_to_delete) := NULL]
167181
168- print(russian_financials [, .(adj_any = mean(adj_any )), keyby = year ])
169- print(russian_financials [, .N , keyby = year ])
182+ # print(russian_financials[, .(adj_any = mean(adj_any)), keyby = year])
183+ # print(russian_financials[, .N, keyby = year])
170184
# Tidy up and save
setorderv(russian_financials, c("inn", "year"))
# Take the latest year in the whole table for the file name. After sorting by
# inn then year, the last row only holds the final year of the last inn, which
# need not be the overall maximum.
maxyear <- max(russian_financials$year, na.rm = TRUE)
write_fst(russian_financials,
          glue::glue("output/russian_financials_2011_{maxyear}.fst"))
174189
175190
176191
0 commit comments