-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathWeb Scraping - vietnamteachingjobs.applescript
More file actions
198 lines (183 loc) · 6.45 KB
/
Web Scraping - vietnamteachingjobs.applescript
File metadata and controls
198 lines (183 loc) · 6.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
-- Scrape jobs from vietnamteachingjobs.com
-- Step 1: open the job listing in a new Safari document and keep clicking the
-- "load more" control until it is no longer present in the page.
tell application "Safari"
	activate
	make new document
	tell document 1
		set URL to "https://vietnamteachingjobs.com"
		-- Fixed wait for the initial page load.
		delay 3
		set clickCount to 0
		-- Safety cap so the script cannot spin forever if the control never
		-- disappears (for example when the click has no effect).
		set maxClicks to 500
		repeat while clickCount is less than maxClicks
			-- Query the number of "load more" controls once per pass.
			set moreCount to 0
			try
				set moreCount to (do JavaScript "document.getElementsByClassName('wpjb-ls-load-more').length") as integer
			end try
			if moreCount is 0 then exit repeat
			do JavaScript "document.getElementsByClassName('wpjb-ls-load-more')[0].firstChild.firstChild.click()"
			-- Give the page a moment to append the next batch of jobs.
			delay 1
			set clickCount to clickCount + 1
			# tell application "System Events" to key code 125 using command down
		end repeat
	end tell
end tell
-- Step 2: collect the detail-page URL of every job shown in the listing.
-- First grab the inner HTML of each element with class 'wpjb-column-title'.
set theString to {}
tell application "Safari"
	tell document 1
		set URLCount to do JavaScript "document.getElementsByClassName('wpjb-column-title').length"
		repeat with i from 0 to URLCount - 1
			set end of theString to do JavaScript "document.getElementsByClassName('wpjb-column-title')[" & i & "].innerHTML"
		end repeat
	end tell
end tell
-- Then extract the URL from each HTML snippet: take the second line of the
-- snippet, split it on double quotes, and keep the first quoted value
-- (presumably the href of the job link -- confirm against the live markup).
set theURLs to {}
repeat with htmlRef in theString
	-- A snippet that has no second line or no quoted value is skipped instead
	-- of aborting the whole run.
	try
		set cellHTML to contents of htmlRef
		set quotedParts to decoupe(paragraph 2 of cellHTML, {"\""})
		set end of theURLs to item 2 of quotedParts
	end try
end repeat
-- Step 3: create a new Numbers document and write the header row into the
-- first table of the first sheet.
tell application "Numbers"
	make new document
	tell document 1
		tell sheet 1
			tell table 1
				-- Column 4 receives the value scraped from the "Date Posted"
				-- label, so its header is named accordingly.
				set headerNames to {"Job Title", "Company Name", "Location", "Date Posted", "Category", "Job Type", "Nationality of Teacher", "Teaching Experience", "Candidate Requirements", "Salary", "URL", "Job Order"}
				set neededColumns to count of headerNames
				-- Add only as many columns as are actually missing, rather
				-- than assuming how many the template table starts with.
				set missingColumns to neededColumns - (get column count)
				if missingColumns is greater than 0 then
					repeat missingColumns times
						add column after last column
					end repeat
				end if
				tell row 1
					repeat with k from 1 to neededColumns
						set value of cell k to item k of headerNames
					end repeat
				end tell
			end tell
		end tell
	end tell
end tell
-- Step 4: visit every collected job URL in Safari, read the job's fields from
-- the page, and write them into row (i + 1) of the Numbers table (row 1 holds
-- the headers).
repeat with i from 1 to count of theURLs
	-- Reset every field for each job so that a page lacking a label leaves the
	-- cell blank instead of repeating the previous job's value.
	set theTitle to ""
	set theCompany to ""
	set location to ""
	set datePosted to ""
	set category to ""
	set jobType to ""
	set nationalityOfTeacher to ""
	set teachingExperience to ""
	set candidateRequirements to ""
	set salary to ""
	set theURL to item i of theURLs
	tell application "Safari"
		tell document 1
			set URL to item i of theURLs
			-- Fixed wait for the job page to load.
			delay 3
			-- Title and company are best-effort, like every other field: a page
			-- with unexpected markup should not abort the whole run.
			try
				set theTitle to do JavaScript "document.getElementsByClassName('entry-title')[0].textContent"
			end try
			try
				set theCompany to do JavaScript "document.getElementsByClassName('wpjb-info-label wpjb-company-name')[0].childNodes[1].firstElementChild.textContent.trim()"
			end try
			-- Inspect the first ten 'wpjb-info-label' elements; each label's
			-- value is read from the element that follows it.
			repeat with j from 0 to 9
				set labelJS to "document.getElementsByClassName('wpjb-info-label')[" & j & "]"
				set valueJS to labelJS & ".nextElementSibling"
				-- Fetch the label text once per index instead of once per field.
				set labelText to ""
				try
					set labelText to do JavaScript (labelJS & ".textContent.trim()")
				end try
				if labelText is missing value then set labelText to ""
				if labelText contains "Date Posted" then
					try
						set datePosted to do JavaScript (valueJS & ".textContent.trim()")
					end try
				end if
				if labelText contains "Category" then
					try
						set category to do JavaScript (valueJS & ".innerText.trim()")
					end try
				end if
				if labelText contains "Job Type" then
					try
						set jobType to do JavaScript (valueJS & ".childNodes[3].firstElementChild.textContent")
					end try
				end if
				if labelText contains "Nationality Of teacher" then
					try
						set nationalityOfTeacher to do JavaScript (valueJS & ".textContent.trim()")
					end try
				end if
				if labelText contains "Teaching Experience" then
					try
						set teachingExperience to do JavaScript (valueJS & ".textContent.trim()")
					end try
				end if
				if labelText contains "Candidate Requirements" then
					try
						set candidateRequirements to do JavaScript (valueJS & ".textContent.trim()")
					end try
				end if
				if labelText contains "Where is the school" then
					try
						set location to do JavaScript (valueJS & ".textContent.trim()")
					end try
				end if
				if labelText contains "Salary:" then
					try
						set salary to do JavaScript (valueJS & ".textContent.trim()")
					end try
				end if
			end repeat
			-- Record the URL Safari actually ended up on; fall back to the
			-- requested URL if it cannot be read.
			try
				set theURL to URL
			end try
		end tell
	end tell
	-- Cell values in column order; the last entry is the job's position in the list.
	set rowValues to {theTitle, theCompany, location, datePosted, category, jobType, nationalityOfTeacher, teachingExperience, candidateRequirements, salary, theURL, i}
	tell application "Numbers"
		tell document 1
			tell sheet 1
				tell table 1
					if (not (exists row (i + 1))) then
						add row below last row
					end if
					tell row (i + 1)
						repeat with k from 1 to count of rowValues
							-- Best-effort per cell: one rejected value must not
							-- stop the remaining cells from being written.
							try
								set value of cell k to item k of rowValues
							end try
						end repeat
					end tell
				end tell
			end tell
		end tell
	end tell
end repeat
-- decoupe: split text into a list of pieces.
--   t : the text to split
--   d : the delimiter, or list of delimiters, to split on
-- Returns the list of text items of t.
-- AppleScript's text item delimiters are restored to their previous value
-- before returning, including when the split raises an error (the error is
-- then re-raised to the caller).
on decoupe(t, d)
	local savedDelims, parts
	set savedDelims to AppleScript's text item delimiters
	try
		set AppleScript's text item delimiters to d
		set parts to text items of t
	on error errMsg number errNum
		-- Do not leave the script-wide delimiters changed after a failure.
		set AppleScript's text item delimiters to savedDelims
		error errMsg number errNum
	end try
	set AppleScript's text item delimiters to savedDelims
	return parts
end decoupe