Skip to content

Commit f9bb5fc

Browse files
committed
temp
1 parent c473789 commit f9bb5fc

File tree

1 file changed

+123
-0
lines changed

1 file changed

+123
-0
lines changed

Self_Tracking/journal_to_csv.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import sys
2+
import pandas as pd
3+
from docx import Document
4+
import math
5+
from itertools import takewhile
6+
7+
doc = Document(sys.argv[1])
8+
rows = []
9+
current_year = None
10+
current_section = None
11+
current_content = []
12+
CHUNK_SZ = 10_000 # python xlsx cutsoff at 32k chars silently
13+
EXCLUDED_SECTIONS = ["Plan", "Weekly Summaries, starting on Monday inclusive of Sunday."]
14+
15+
for p in doc.paragraphs:
16+
style, text = p.style.name, p.text
17+
if not text.strip():
18+
continue
19+
if style == "Heading 2":
20+
current_year = text.strip()
21+
elif style == "Heading 4":
22+
if current_content and current_section not in EXCLUDED_SECTIONS:
23+
full = "\n".join(current_content)
24+
total_len = len(full)
25+
print(total_len, len(current_content), CHUNK_SZ)
26+
n_chunks = math.ceil(total_len / CHUNK_SZ)
27+
target = math.ceil(total_len / n_chunks)
28+
chunks = []
29+
buf = []
30+
buf_len = 0
31+
for line in current_content:
32+
line_len = len(line) + 1
33+
if buf and buf_len + line_len > target:
34+
chunks.append(buf)
35+
buf = []
36+
buf_len = 0
37+
buf.append(line)
38+
buf_len += line_len
39+
if buf:
40+
chunks.append(buf)
41+
42+
for i, chunk in enumerate(chunks, start=1):
43+
rows.append(
44+
{
45+
"Year": current_year,
46+
"Section": f"{current_section} ({i})",
47+
"Content": "\n".join(chunk),
48+
}
49+
)
50+
current_section = text.strip()
51+
current_content = []
52+
else:
53+
if current_section and current_section not in EXCLUDED_SECTIONS:
54+
current_content.append(text.rstrip())
55+
56+
if current_section and current_section not in EXCLUDED_SECTIONS:
57+
rows.append(
58+
{"Year": current_year, "Section": current_section, "Content": "\n".join(current_content)}
59+
)
60+
61+
df = pd.DataFrame(rows, columns=["Year", "Section", "Content"])
62+
df.to_excel(sys.argv[2], engine="openpyxl", index=False)
63+
64+
65+
# %%
66+
# Stop switch: while this flag is true, evaluating 1 / 0 raises
# ZeroDivisionError (print() is never actually reached), so nothing below
# this point runs. NOTE(review): presumably a deliberate way to disable the
# CSV export that follows - confirm before removing.
HALT_BEFORE_CSV_EXPORT = True

if HALT_BEFORE_CSV_EXPORT:
    print(1 / 0)
68+
69+
import sys
70+
import csv
71+
from docx import Document
72+
import math
73+
74+
EXCLUDED_SECTIONS = ["Plan", "Weekly Summaries, starting on Monday inclusive of Sunday."]


def collect_csv_rows(paragraphs, excluded_sections=EXCLUDED_SECTIONS):
    """Turn a stream of paragraphs into CSV rows, one row per section.

    Args:
        paragraphs: Iterable of ``(style_name, text)`` pairs in document
            order. "Heading 2" paragraphs set the current year, "Heading 4"
            paragraphs start a new section, and every other non-blank
            paragraph is content of the current section.
        excluded_sections: Section titles whose content is dropped.

    Returns:
        A list of dicts with the keys "Year", "Section" and "Content".
        Sections without content, and anything that appears before the first
        "Heading 4", produce no row.
    """
    rows = []
    year = None
    section = None
    section_year = None
    content = []

    def flush():
        # Emit the section collected so far, if there is anything to emit.
        if section and content and section not in excluded_sections:
            rows.append(
                {
                    "Year": section_year,
                    "Section": section,
                    "Content": "\n".join(content),
                }
            )

    for style, raw_text in paragraphs:
        text = raw_text.strip()
        if not text:
            continue

        if style == "Heading 2":
            year = text
        elif style == "Heading 4":
            flush()
            section = text
            # Remember the year in force when the section starts; a later
            # "Heading 2" must not relabel a section that is still open.
            section_year = year
            content = []
        elif section and section not in excluded_sections:
            # Collect only once a real, non-excluded section has started;
            # text ahead of the first section is never written out.
            content.append(text)

    flush()  # the last section has no following heading to trigger it
    return rows


# Run the conversion only when executed as a script (or as a notebook cell),
# so the helper above can be imported without touching sys.argv or files.
if __name__ == "__main__":
    doc = Document(sys.argv[1])
    rows = collect_csv_rows((p.style.name, p.text) for p in doc.paragraphs)

    # Write to CSV with every field quoted.
    with open(sys.argv[2], "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(
            f, fieldnames=["Year", "Section", "Content"], quoting=csv.QUOTE_ALL, escapechar="\\"
        )
        w.writeheader()
        w.writerows(rows)

0 commit comments

Comments
 (0)