-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSplitter.py
More file actions
76 lines (63 loc) · 2.11 KB
/
Splitter.py
File metadata and controls
76 lines (63 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import re
import pandas as pd
def RegEX(string):
    """Strip brackets, digits, punctuation and macrons from ``string``.

    The cleaning steps run in this order:

    1. remove every round, square and curly bracket;
    2. remove every run of digits;
    3. collapse each run of whitespace (spaces, tabs, newlines) into a
       single space;
    4. remove every character that is neither a word character nor
       whitespace;
    5. remove U+0304 (combining macron).

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    # Inside a character class the square brackets must be escaped.
    # Left unescaped, the class closes early and the pattern only matches a
    # bracket that is directly followed by ']', so lone brackets slip through
    # to the punctuation step and leave doubled spaces behind.
    cleaned = re.sub(r"[()\[\]{}]", "", string)  # removing brackets
    cleaned = re.sub(r"\d+", "", cleaned)  # removing numbers
    cleaned = re.sub(r"\s+", " ", cleaned)  # collapsing tabs/newlines/spaces
    # NOTE: punctuation is stripped after whitespace has been collapsed, so a
    # punctuation mark that stood between two spaces leaves two adjacent
    # spaces in the result.
    cleaned = re.sub(r"[^\w\s]", "", cleaned)  # removing punctuation
    return re.sub("\u0304", "", cleaned)  # removing combining macrons
# Split every text listed in the dates CSV into cleaned, fixed-size chunks
# and record the name of every chunk file that was written.
#
# Change to smaller chunks/upsampled dates csv file

CORPUS_DIR = 'LatLib'  # directory (relative to the start directory) holding the source texts
OUTPUT_ROOT = '/home/sittch/Spring2023/DLT2'
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, 'LatLib_500char_unpunc')
HEADER_CHARS = 100  # leading characters dropped from every source file
CHUNK_SIZE = 500  # number of characters in each chunk file


def _read_body(relative_path):
    """Return one corpus file's contents without its first HEADER_CHARS characters.

    ``relative_path`` is a '/'-separated path below CORPUS_DIR. Joining the
    components (rather than changing directory into the first one and opening
    the second) keeps paths with any number of sub-directories working.
    """
    full_path = os.path.join(CORPUS_DIR, *relative_path.split('/'))
    with open(full_path, 'r') as f:
        return f.read()[HEADER_CHARS:]


def _fixed_chunks(text, size):
    """Split ``text`` into consecutive pieces of exactly ``size`` characters.

    A trailing remainder shorter than ``size`` is not returned.
    """
    return [text[start:start + size]
            for start in range(0, len(text) - size + 1, size)]


data = pd.read_csv('LatLibDates-Filtered.csv', encoding='latin_1')
data.rename(columns={'V1': 'Text', 'V2': 'Target'}, inplace=True)
labels = data['Target']  # not used further in this script

# Work on plain lists so that nothing is assigned back into the DataFrame
# column element by element.
source_paths = data['Text'].tolist()
print("number of texts :", len(source_paths))
print(source_paths[0])

# Read and clean every text before anything is written, so a missing source
# file aborts the run before any output file is created.
texts = [RegEX(_read_body(path)) for path in source_paths]

# Output files are named after the source path with the '/' characters removed.
Names = [path.replace('/', '') for path in source_paths]

UWU = []  # names of all files written, in the order they were written
for name, text in zip(Names, texts):
    if len(text) >= CHUNK_SIZE:
        for j, piece in enumerate(_fixed_chunks(text, CHUNK_SIZE)):
            filename = f"{name}_{j}.txt"
            with open(os.path.join(OUTPUT_DIR, filename), "w") as text_file:
                print(piece, file=text_file)
            UWU.append(filename)
    else:
        # Texts shorter than one chunk are written whole, without an index
        # suffix in the file name.
        filename = f"{name}.txt"
        with open(os.path.join(OUTPUT_DIR, filename), "w") as text_file:
            print(text, file=text_file)
        UWU.append(filename)

# One written file name per line.
with open(os.path.join(OUTPUT_ROOT, 'Final_Rank.csv'), 'w') as f:
    f.writelines(f"{line}\n" for line in UWU)