Skip to content

Commit 1465ce3

Browse files
author
Sergey Bondarkov
committed
bugfix: Erroneous values in line 2400 (net profit)
1 parent 7cc5291 commit 1465ce3

2 files changed

Lines changed: 76 additions & 38 deletions

File tree

code/1_financials/4_combine_rosstat_fns_panels.R

Lines changed: 51 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ rosstat_financials_20172018_new_obs <- fns_financials_20172018[!rosstat_financia
1212
rosstat_financials <- rbindlist(list(
1313
rosstat_financials,
1414
rosstat_financials_20172018_new_obs
15-
),
15+
),
1616
use.names = T, fill = T
1717
)
1818

@@ -23,40 +23,63 @@ combined_financials <- rbindlist(list(
2323
fns_financials[year >= 2019]
2424
),
2525
fill = T, use.names = T
26+
)
2627

2728
# Check
28-
print(dcast(combined_financials[, .N, keyby = .(year, all_na)], year ~ paste0("all_na_", all_na)))
29-
print(dcast(combined_financials[, .N, keyby = .(year, new_obs)], year ~ paste0("new_obs_", new_obs)))
30-
print(dcast(combined_financials[, .N, keyby = .(new_obs, all_na)], new_obs ~ paste0("all_na_", all_na)))
31-
print(dcast(combined_financials[, .N, keyby = .(simplified, all_na)], simplified ~ paste0("all_na_", all_na)))
32-
print(combined_financials[, lapply(.SD, mean), .SDcols = patterns("imp_"), keyby = year])
29+
# print(dcast(combined_financials[, .N, keyby = .(year, all_na)], year ~ paste0("all_na_", all_na)))
30+
# print(dcast(combined_financials[, .N, keyby = .(year, new_obs)], year ~ paste0("new_obs_", new_obs)))
31+
# print(dcast(combined_financials[, .N, keyby = .(new_obs, all_na)], new_obs ~ paste0("all_na_", all_na)))
32+
# print(dcast(combined_financials[, .N, keyby = .(simplified, all_na)], simplified ~ paste0("all_na_", all_na)))
33+
# print(combined_financials[, lapply(.SD, mean), .SDcols = patterns("imp_"), keyby = year])
3334

34-
# Negative lines values to positive =================================================
35+
# Bracketed negative lines values to positive =================================================
3536

36-
neg_lines <- paste0("line_", c(1320:1323,
37-
2120:2123,
38-
2210:2213,
39-
2220:2223,
40-
2330:2333,
41-
2350:2353,
42-
2411,
43-
3220:3227,
44-
3320:3327,
45-
4120:4129,
46-
4220:4229,
47-
4320:4329,
48-
6310:6313,
49-
6320:6326,
50-
6330,
51-
6350:6359,
52-
6300
53-
))
37+
neg_lines_full <- paste0("line_", c(
38+
1320:1323,
39+
2120:2123,
40+
2210:2213,
41+
2220:2223,
42+
2330:2333,
43+
2350:2353,
44+
2411,
45+
3220:3227,
46+
3320:3327,
47+
4120:4129,
48+
4220:4229,
49+
4320:4329,
50+
6310:6313,
51+
6320:6326,
52+
6330,
53+
6350:6359,
54+
6300
55+
))
5456

5557
# Not all the lines are present in data so we repack them as regex
56-
neg_lines_pattern <- paste(neg_lines, collapse="|")
57-
neg_lines_present <- grep(neg_lines_pattern, names(combined_financials), value = T)
58+
neg_lines_pattern_full <- paste(neg_lines_full, collapse="|")
59+
neg_lines_present_full <- grep(neg_lines_pattern_full, names(combined_financials), value = T)
60+
combined_financials[simplified == 0, (neg_lines_present_full) := lapply(.SD, function(l) fifelse(l < 0, -l, l)), .SDcols = neg_lines_present_full]
61+
62+
neg_lines_simple <- paste0("line_", c(
63+
2120,
64+
2330,
65+
2350,
66+
2410,
67+
3220:3227,
68+
3320:3327,
69+
4120:4129,
70+
4220:4229,
71+
4320:4329,
72+
6310,
73+
6320,
74+
6330,
75+
6350,
76+
6300
77+
))
5878

59-
combined_financials[, (neg_lines_present) := lapply(.SD, function(l) fifelse(l < 0, -l, l)), .SDcols = neg_lines_present]
79+
# Not all the lines are present in data so we repack them as regex
80+
neg_lines_pattern_simple <- paste(neg_lines_simple, collapse="|")
81+
neg_lines_present_simple <- grep(neg_lines_pattern_simple, names(combined_financials), value = T)
82+
combined_financials[simplified == 1, (neg_lines_present_simple) := lapply(.SD, function(l) fifelse(l < 0, -l, l)), .SDcols = neg_lines_present_simple]
6083

6184
# Save
6285
setorderv(combined_financials, c("inn", "year"))

code/1_financials/6_adjust_values.R

Lines changed: 25 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@ setDTthreads(0)
55
# Load data
66
russian_financials <- read_fst("temp/combined_financials_impFrNY_negLinesCorr.fst", as.data.table = T)
77
russian_financials[, adj_any := 0]
8-
# russian_financials[inn == "0100000621", .SD, .SDcols = patterns("line_1")]
98

109
# Declare imputation function
1110
impute <- function(dt, imp_target, lines_to_sum, flag_imputation = T) {
@@ -26,7 +25,6 @@ impute <- function(dt, imp_target, lines_to_sum, flag_imputation = T) {
2625
dt[, imp_value := NULL, env = env]
2726
dt[, useless := NULL]
2827
dt[, orig_value := NULL]
29-
# return(dt)
3028
}
3129

3230
# Function demo:
@@ -52,9 +50,11 @@ XXX0_lines_for_simple_imp <- c("line_3210", "line_3220",
5250

5351
for(imp_target in XXX0_lines_for_simple_imp) {
5452
regex <- paste0(stringi::stri_sub(imp_target, 1, -2), "[1-9x]") # e.g. "line_1230" > "line_123[1-9x]"
53+
# print(regex)
5554
lines_to_sum <- grep(regex, names(russian_financials), value = T)
5655
# message(imp_target, " = ", paste(lines_to_sum, collapse = " + "))
5756
impute(russian_financials, imp_target, lines_to_sum)
57+
}
5858

5959
# Higher level imputation: sum of XX[1-9]0 lines -----------------------------
6060
XX00_lines_for_simple_imp <- c("line_1100", "line_1200",
@@ -64,6 +64,14 @@ XX00_lines_for_simple_imp <- c("line_1100", "line_1200",
6464

6565
for(imp_target in XX00_lines_for_simple_imp) {
6666
regex <- paste0(stringi::stri_sub(imp_target, 1, -3), "[1-9]0") # e.g. "line_1200" > "line_12[1-9]0"
67+
lines_to_sum <- grep(regex, names(russian_financials), value = T)
68+
# message(imp_target, " = ", paste(lines_to_sum, collapse = " + "))
69+
impute(russian_financials, imp_target, lines_to_sum)
70+
}
71+
# russian_financials[inn == "0100001343", .SD, .SDcols = patterns("year|imp|line_1")]
72+
# russian_financials_simple[inn == "0100001343", .SD, .SDcols = patterns("year|imp|line_1")]
73+
# russian_financials[inn == "0100000621", .SD, .SDcols = patterns("year|imp|line_1")]
74+
# russian_financials_full[inn == "0100000621", .SD, .SDcols = patterns("year|imp|line_1")]
6775

6876
# Imputation using formulas common to simplified and non-simplified statements ---------
6977
russian_financials[, line_6300_neg := -line_6300]
@@ -81,19 +89,22 @@ gc()
8189
for(l in c("line_2120", "line_2210", "line_2220", "line_2330", "line_2350",
8290
"line_3220", "line_3320", "line_4120", "line_4220", "line_4320")) {
8391
russian_financials_full[, l_neg := -l, env = list(l = l, l_neg = paste0(l, "_neg"))]
92+
}
8493
### Impute
8594
impute(russian_financials_full, "line_1600", c("line_1100", "line_1200"))
8695
impute(russian_financials_full, "line_1700", c("line_1300", "line_1400", "line_1500"))
8796
impute(russian_financials_full, "line_2100", c("line_2110", "line_2120_neg"))
8897
impute(russian_financials_full, "line_2200", c("line_2100", "line_2210_neg", "line_2220_neg"))
8998
impute(russian_financials_full, "line_2300", c("line_2200", "line_2310", "line_2320", "line_2330_neg", "line_2340", "line_2350_neg"))
9099

91-
92-
93100
# Impute 24XX
94-
russian_financials_full[, line_2410_neg := -line_2410]
95-
impute(russian_financials_full[year >= 2020], "line_2410", c("line_2411", "line_2412"))
96-
impute(russian_financials_full, "line_2400", c("line_2300", "line_2410_neg", "line_2460"))
101+
russian_financials_full[, line_2411_neg := -line_2411]
102+
impute(russian_financials_full, "line_2410", c("line_2411_neg", "line_2412"))
103+
impute(russian_financials_full, "line_2400", c("line_2300", "line_2410", "line_2460"))
104+
105+
# # Check
106+
# russian_financials_full[inn == "7703443256", .(year, line_2400, line_2300, line_2410, line_2411, line_2460)]
107+
# russian_financials_full[year >= 2019][inn %in% sample(inn, 5), .(inn, year, line_2400, line_2300, line_2410, line_2411, line_2460)]
97108

98109
## Construct 24XX lines with the same meaning across different periods
99110
russian_financials_full[, line_2410_uniform_tax := NA_real_]
@@ -149,8 +160,11 @@ impute(russian_financials_full, "line_4400", c("line_4100", "line_4200", "line_4
149160
impute(russian_financials_full, "line_4500", c("line_4400", "line_4450", "line_4490"))
150161

151162
## Simplified statements
163+
### Prepare some lines
152164
for(l in c("line_2120", "line_2330", "line_2350", "line_2410")) {
153165
russian_financials_simple[, l_neg := -l, env = list(l = l, l_neg = paste0(l, "_neg"))]
166+
}
167+
### Impute
154168
impute(russian_financials_simple, "line_1600", c("line_1150", "line_1170", "line_1210", "line_1250", "line_1230"))
155169
impute(russian_financials_simple, "line_1700", c("line_1300", "line_1350", "line_1360", "line_1410", "line_1450", "line_1510", "line_1520", "line_1550"))
156170
impute(russian_financials_simple, "line_2200", c("line_2110", "line_2120_neg"))
@@ -165,12 +179,13 @@ russian_financials <- rbindlist(list(russian_financials_full, russian_financials
165179
lines_to_delete <- grep("\\d_neg", names(russian_financials), value = T)
166180
russian_financials[, (lines_to_delete) := NULL]
167181

168-
print(russian_financials[, .(adj_any = mean(adj_any)), keyby = year])
169-
print(russian_financials[, .N, keyby = year])
182+
# print(russian_financials[, .(adj_any = mean(adj_any)), keyby = year])
183+
# print(russian_financials[, .N, keyby = year])
170184

171185
# Tidy up and save
172186
setorderv(russian_financials, c("inn", "year"))
173-
write_fst(russian_financials, "output/russian_financials_2011_2023_imp_adj.fst")
187+
maxyear <- last(russian_financials$year)
188+
write_fst(russian_financials, glue::glue("output/russian_financials_2011_{maxyear}.fst"))
174189

175190

176191

0 commit comments

Comments
 (0)