-
Notifications
You must be signed in to change notification settings - Fork 87
Expand file tree
/
Copy pathsync.py
More file actions
247 lines (213 loc) · 9.65 KB
/
sync.py
File metadata and controls
247 lines (213 loc) · 9.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
import collections
import singer
from singer import bookmarks
from tap_github.streams import STREAMS
# Module-level logger obtained from the singer library.
LOGGER = singer.get_logger()
# Streams that `sync` runs once per organization instead of once per repository.
STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships']
def get_selected_streams(catalog):
    """
    Return the `tap_stream_id` of every stream the catalog marks as selected.

    Only stream metadata is inspected: a stream is reported once for each
    metadata entry that has an empty breadcrumb (the stream-level entry)
    and a truthy `selected` value. Catalog order is preserved.
    """
    return [
        stream['tap_stream_id']
        for stream in catalog['streams']
        for mdata_entry in stream['metadata']
        # Stream-level metadata is the entry whose breadcrumb is empty.
        if not mdata_entry['breadcrumb'] and mdata_entry['metadata'].get('selected')
    ]
def update_currently_syncing(state, stream_name):
    """
    Record which stream is being synced and emit the updated state.

    When `stream_name` is falsy and the state currently holds a syncing
    stream, the `currently_syncing` key is removed; in every other case
    the key is set to `stream_name`. The state is written out afterwards.
    """
    if stream_name or not singer.get_currently_syncing(state):
        singer.set_currently_syncing(state, stream_name)
    else:
        # Nothing is being synced any more: drop the marker entirely.
        del state['currently_syncing']

    singer.write_state(state)
def update_currently_syncing_repo(state, repo_path):
    """
    Record which repository is being synced and emit the updated state.

    When `repo_path` is falsy and `currently_syncing_repo` is present in the
    state, the key is removed (used once all repositories are synced); in
    every other case the key is set to `repo_path`. The state is written
    out afterwards.
    """
    if repo_path or 'currently_syncing_repo' not in state:
        state['currently_syncing_repo'] = repo_path
    else:
        # All repositories are done: flush the marker from the state.
        del state['currently_syncing_repo']

    singer.write_state(state)
def get_ordered_stream_list(currently_syncing, streams_to_sync):
    """
    Return the streams to sync, sorted and rotated to resume where a previous run stopped.

    The streams are sorted first. If `currently_syncing` is among them, the
    list is rotated so that it comes first, followed by the streams after it
    and then the streams that sorted before it.
    """
    ordered = sorted(streams_to_sync)
    try:
        pivot = ordered.index(currently_syncing)
    except ValueError:
        # No stream to resume from: plain sorted order.
        return ordered
    return ordered[pivot:] + ordered[:pivot]
def get_ordered_repos(state, repositories):
    """
    Return the repositories rotated to resume from `currently_syncing_repo`.

    If the state's `currently_syncing_repo` is one of `repositories`, a new
    list is returned that starts with it, continues with the repositories
    after it and ends with the ones before it. Otherwise `repositories` is
    returned unchanged.
    """
    resume_from = state.get("currently_syncing_repo")
    if resume_from not in repositories:
        return repositories

    pivot = repositories.index(resume_from)
    return repositories[pivot:] + repositories[:pivot]
def translate_state(state, catalog, repositories):
    '''
    Convert bookmarks from the old repo-first layout to the stream-first layout.

    The tap supports multiple repositories. Previously, bookmarks nested the
    stream keys under each repository:
        {
          "bookmarks": {
            "singer-io/tap-adwords": {
              "commits": {
                "since": "2018-11-14T13:21:20.700360Z"
              }
            },
            "singer-io/tap-salesforce": {
              "commits": {
                "since": "2018-11-14T13:21:20.700360Z"
              }
            }
          }
        }

    Standardized table-level resets require the stream name to be the key
    directly under `bookmarks`, so this function is meant to be called at
    the start of each run to produce the new layout:
        {
          "bookmarks": {
            "commits" : {
              "singer-io/tap-adwords": {
                "since": "2018-11-14T13:21:20.700360Z"
              },
              "singer-io/tap-salesforce": {
                "since": "2018-11-14T13:21:20.700360Z"
              }
            },
            "issues" : {
              "singer-io/tap-adwords": {
                "since": "2018-11-14T13:21:20.700360Z"
              },
              "singer-io/tap-salesforce": {
                "since": "2018-11-14T13:21:20.700360Z"
              }
            }
          }
        }

    Entries already in the new layout are carried over as they are, and all
    top-level state keys other than `bookmarks` are copied unchanged. The
    returned object is a new nested `defaultdict`; `state` is not modified.
    '''
    def _autovivify():
        # A defaultdict that creates nested defaultdicts on first access.
        return collections.defaultdict(_autovivify)

    translated = _autovivify()

    # First-level bookmark keys: stream names (new layout) or repo names (old layout).
    outer_keys = state.get('bookmarks', {}).keys()
    known_streams = [entry['tap_stream_id'] for entry in catalog['streams']]

    for outer_key in outer_keys:
        for inner_key, bookmark in state['bookmarks'][outer_key].items():
            if inner_key in known_streams or outer_key in repositories:
                # Old layout (repo -> stream): swap so the stream comes first.
                translated['bookmarks'][inner_key][outer_key] = bookmark
            else:
                # Already stream-first (or unrecognized): keep the nesting.
                translated['bookmarks'][outer_key][inner_key] = bookmark

    for stream_name in known_streams:
        for repo in repositories:
            # Look the bookmark up in the new layout first and the old layout
            # second; when both exist, the old-layout value is the one kept.
            for first, second in ((stream_name, repo), (repo, stream_name)):
                found = bookmarks.get_bookmark(state, first, second)
                if found:
                    translated['bookmarks'][stream_name][repo] = found

    # Preserve other key-value pairs in state
    translated.update(
        (top_key, top_value)
        for top_key, top_value in state.items()
        if top_key != "bookmarks"
    )

    return translated
def get_stream_to_sync(catalog):
    """
    Return the names of the streams whose sync function has to be called.

    A stream from `STREAMS` is included when it is selected itself or when
    any child (or nested child) of it is selected, so a deselected parent is
    still synced on behalf of its selected children. The order follows
    `STREAMS`.
    """
    selected = get_selected_streams(catalog)
    return [
        name
        for name, stream_cls in STREAMS.items()
        if name in selected or is_any_child_selected(stream_cls, selected)
    ]
def is_any_child_selected(stream_obj, selected_streams):
    """
    Check whether any child or nested child of `stream_obj` is selected.

    Args:
        stream_obj: A stream (class or instance) exposing a `children`
            collection of stream names; each name must be a key of `STREAMS`.
        selected_streams: Collection of selected stream names.

    Returns:
        True if at least one descendant of `stream_obj` appears in
        `selected_streams`, otherwise False. The stream itself is not
        considered.
    """
    for child in stream_obj.children or ():
        if child in selected_streams:
            return True
        # Only stop early on a positive result. Returning the recursive
        # verdict unconditionally would skip the remaining siblings whenever
        # the first child's own subtree has nothing selected.
        if is_any_child_selected(STREAMS[child], selected_streams):
            return True
    return False
def write_schemas(stream_id, catalog, selected_streams):
    """
    Write the schema of a stream and, recursively, of its child streams.

    The schema message for `stream_id` is written only when that stream is
    selected; its children are visited either way, so selected children of
    a deselected parent still get their schemas written.
    """
    stream_instance = STREAMS[stream_id]()

    if stream_id in selected_streams:
        # Pick the catalog entry that belongs to this stream.
        matching_entries = [
            entry for entry in catalog['streams']
            if entry['tap_stream_id'] == stream_id
        ]
        catalog_entry = matching_entries[0]
        singer.write_schema(stream_id, catalog_entry['schema'], catalog_entry['key_properties'])

    for child_id in stream_instance.children:
        write_schemas(child_id, catalog, selected_streams)
def sync(client, config, state, catalog):
    """
    Sync selected streams.

    The state is first translated to the stream-first bookmark layout and
    written out. If at least one stream is selected, the organization-level
    streams (`STREAM_TO_SYNC_FOR_ORGS`) are synced once per organization and
    all remaining streams once per repository, resuming from
    `currently_syncing_repo` when it is present in the state.
    """
    start_date = config['start_date']

    # Selected streams, plus the parents required by selected child streams.
    selected_stream_ids = get_selected_streams(catalog)
    streams_to_sync = get_stream_to_sync(catalog)
    LOGGER.info('Sync stream %s', streams_to_sync)

    repositories, organizations = client.extract_repos_from_config()

    state = translate_state(state, catalog, repositories)
    singer.write_state(state)

    # Nothing is selected, so there is nothing to sync.
    if not selected_stream_ids:
        return

    # `teams`, `team_members` and `team_memberships` are synced a single time
    # per organization; every other stream is synced per repository.
    all_streams = set(streams_to_sync)
    org_streams = all_streams.intersection(STREAM_TO_SYNC_FOR_ORGS)
    repo_streams = all_streams - org_streams

    for org in organizations:
        LOGGER.info("Starting sync of organization: %s", org)
        do_sync(catalog, org_streams, selected_stream_ids, client, start_date, state, org)

    for repo in get_ordered_repos(state, repositories):
        update_currently_syncing_repo(state, repo)
        LOGGER.info("Starting sync of repository: %s", repo)
        do_sync(catalog, repo_streams, selected_stream_ids, client, start_date, state, repo)

        if client.not_accessible_repos:
            # Warn about a repo that is invalid or not accessible for some
            # streams, then reset the collection for the next repository.
            message = "Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository for following streams {}.".format(repo, ", ".join(client.not_accessible_repos))
            LOGGER.warning(message)
            client.not_accessible_repos = set()

    # Every repository is done: clear the resume marker.
    update_currently_syncing_repo(state, None)
def do_sync(catalog, streams_to_sync, selected_stream_ids, client, start_date, state, repo):
    """
    Sync the given streams for a single repository or organization.

    Streams are processed in the order given by `get_ordered_stream_list`,
    starting from the stream recorded as currently syncing in the state, if
    any. Only streams without a parent are started here; for each of them
    the schemas are written, the stream is marked as currently syncing, its
    `sync_endpoint` is run and the state it returns is written out.
    """
    resume_from = singer.get_currently_syncing(state)

    for stream_id in get_ordered_stream_list(resume_from, streams_to_sync):
        stream_instance = STREAMS[stream_id]()

        # A child stream is synced as part of its parent stream, so it is
        # never started directly from this loop.
        if stream_id in streams_to_sync and not stream_instance.parent:
            write_schemas(stream_id, catalog, selected_stream_ids)
            update_currently_syncing(state, stream_id)

            endpoint_kwargs = {
                'client': client,
                'state': state,
                'catalog': catalog['streams'],
                'repo_path': repo,
                'start_date': start_date,
                'selected_stream_ids': selected_stream_ids,
                'stream_to_sync': streams_to_sync,
            }
            state = stream_instance.sync_endpoint(**endpoint_kwargs)
            singer.write_state(state)

        # Clear the currently-syncing marker before moving to the next stream.
        update_currently_syncing(state, None)