Skip to content

Commit 10e6de3

Browse files
whitwham authored and daviesrob committed
Add S3 v4 signature code for reading and writing.
Writing needs POST requests and the ability to read 301 (moved permanently) response data. As these features are not available from hfile_libcurl, a new plug-in called hfile_s3_write is added which uses libcurl directly. The main hfile_s3 plug-in passes write requests on to hfile_s3_write.
1 parent 5c7ff9c commit 10e6de3

8 files changed

Lines changed: 1697 additions & 49 deletions

File tree

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Makefile for htslib, a C library for high-throughput sequencing data formats.
22
#
3-
# Copyright (C) 2013-2018 Genome Research Ltd.
3+
# Copyright (C) 2013-2019 Genome Research Ltd.
44
#
55
# Author: John Marshall <jm18@sanger.ac.uk>
66
#
@@ -307,6 +307,7 @@ hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(hts
307307
hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h)
308308
hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h)
309309
hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h)
310+
hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
310311
hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
311312
hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h)
312313
hts_os.o hts_os.pico: hts_os.c config.h os/rand.c

NEWS

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,28 @@ Noteworthy changes in release a.b
4848
example, "/path1/my_data.bam##idx##/path2/my_index.csi" will open bam file
4949
"/path1/my_data.bam" and index file "/path2/my_index.csi".
5050

51+
* Changes to hfile_s3, which provides support for the AWS S3 API.
52+
53+
- hfile_s3 now uses version 4 signatures by default. Attempting to write to
54+
an S3 bucket will also now work correctly. It is possible to force
55+
version 2 signatures by creating environment variable HTS_S3_V2 (the exact
56+
value does not matter, it just has to exist). Note that writing depends
57+
on features that need version 4 signatures, so forcing version 2 will
58+
disable writes.
59+
60+
- hfile_s3 will automatically retry requests where the region endpoint
61+
was not specified correctly, either by following the 301 redirect (when
62+
using path-style requests) or reading the 400 response (when using
63+
virtual-hosted style requests and version 4 signatures). The first
64+
region to try can be set by using the AWS_DEFAULT_REGION environment
65+
variable, by setting "region" in ".aws/credentials" or by setting
66+
"bucket_location" in ".s3cfg".
67+
68+
- hfile_s3 now percent-escapes the path component of s3:// URLs. For
69+
backwards-compatibility it will ignore any paths that have already
70+
been escaped (detected by looking for '%' followed by two hexadecimal
71+
digits.)
72+
5173
* Fixed bug where some 8 or 16-bit negative integers were stored using values
5274
reserved by the BCF specification. These numbers are now promoted to the
5375
next size up, so -121 to -128 are stored using at least 16 bits, and -32761

config.mk.in

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Optional configure Makefile overrides for htslib.
22
#
3-
# Copyright (C) 2015-2017 Genome Research Ltd.
3+
# Copyright (C) 2015-2017, 2019 Genome Research Ltd.
44
#
55
# Author: John Marshall <jm18@sanger.ac.uk>
66
#
@@ -74,10 +74,12 @@ endif
7474

7575
ifeq "s3-@s3@" "s3-enabled"
7676
plugin_OBJS += hfile_s3.o
77+
plugin_OBJS += hfile_s3_write.o
7778

7879
CRYPTO_LIBS = @CRYPTO_LIBS@
7980
noplugin_LIBS += $(CRYPTO_LIBS)
8081
hfile_s3$(PLUGIN_EXT): LIBS += $(CRYPTO_LIBS)
82+
hfile_s3_write$(PLUGIN_EXT): LIBS += $(CRYPTO_LIBS) $(LIBCURL_LIBS)
8183
endif
8284

8385
ifeq "plugins-@enable_plugins@" "plugins-yes"
@@ -94,6 +96,7 @@ plugin.o plugin.pico: CPPFLAGS += -DPLUGINPATH=\"$(pluginpath)\"
9496
hfile_gcs.o hfile_gcs.pico: version.h
9597
hfile_libcurl.o hfile_libcurl.pico: version.h
9698
hfile_s3.o hfile_s3.pico: version.h
99+
hfile_s3_write.o hfile_s3_write.pico: version.h
97100

98101
# Windows DLL plugins depend on the import library, built as a byproduct.
99102
$(plugin_OBJS:.o=.cygdll): cyghts-$(LIBHTS_SOVERSION).dll

hfile.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -972,6 +972,7 @@ static void load_hfile_plugins()
972972
#endif
973973
#ifdef ENABLE_S3
974974
init_add_plugin(NULL, hfile_plugin_init_s3, "s3");
975+
init_add_plugin(NULL, hfile_plugin_init_s3_write, "s3w");
975976
#endif
976977

977978
#endif

hfile_internal.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ extern int hfile_plugin_init(struct hFILE_plugin *self);
169169
extern int hfile_plugin_init_gcs(struct hFILE_plugin *self);
170170
extern int hfile_plugin_init_libcurl(struct hFILE_plugin *self);
171171
extern int hfile_plugin_init_s3(struct hFILE_plugin *self);
172+
extern int hfile_plugin_init_s3_write(struct hFILE_plugin *self);
172173
#endif
173174

174175
/* This one is never built as a separate plugin. */
@@ -179,6 +180,18 @@ extern int hfile_plugin_init_net(struct hFILE_plugin *self);
179180
// although we may consider exposing it in the API later.
180181
typedef int (* hts_httphdr_callback) (void *cb_data, char ***hdrs);
181182

183+
/** Callback for handling 3xx redirect responses from http connections.
184+
185+
@param data is passed to the callback
186+
@param response http response code (e.g. 301)
187+
@param headers http response headers
188+
@param new_url the callback should write the url to switch to in here
189+
190+
Currently used by s3 to handle switching region endpoints.
191+
*/
192+
typedef int (*redirect_callback) (void *data, long response,
193+
kstring_t *headers, kstring_t *new_url);
194+
182195
#ifdef __cplusplus
183196
}
184197
#endif

hfile_libcurl.c

Lines changed: 102 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* hfile_libcurl.c -- libcurl backend for low-level file streams.
22
3-
Copyright (C) 2015-2017 Genome Research Ltd.
3+
Copyright (C) 2015-2017, 2019 Genome Research Ltd.
44
55
Author: John Marshall <jm18@sanger.ac.uk>
66
@@ -77,13 +77,18 @@ typedef struct {
7777
hdrlist fixed; // List of headers supplied at hopen()
7878
hdrlist extra; // List of headers from callback
7979
hts_httphdr_callback callback; // Callback to get more headers
80-
void *callback_data; // Data to pass to callback
80+
void *callback_data; // Data to pass to httphdr callback
8181
auth_token *auth; // Authentication token
8282
int auth_hdr_num; // Location of auth_token in hdrlist extra
8383
// If -1, Authorization header is in fixed
8484
// -2, it came from the callback
8585
// -3, "auth_token_enabled", "false"
8686
// passed to hopen()
87+
redirect_callback redirect; // Callback to handle 3xx redirects
88+
void *redirect_data; // Data to pass to redirect_callback
89+
long *http_response_ptr; // Location to store http response code.
90+
int fail_on_error; // Open fails on >=400 response code
91+
// (default true)
8792
} http_headers;
8893

8994
typedef struct {
@@ -107,6 +112,7 @@ typedef struct {
107112
unsigned tried_seek : 1; // At least one seek has been attempted
108113
int nrunning;
109114
http_headers headers;
115+
110116
off_t delayed_seek; // Location to seek to before reading
111117
off_t last_offset; // Location we're seeking from
112118
} hFILE_libcurl;
@@ -720,7 +726,10 @@ static size_t recv_callback(char *ptr, size_t size, size_t nmemb, void *fpv)
720726
hFILE_libcurl *fp = (hFILE_libcurl *) fpv;
721727
size_t n = size * nmemb;
722728

723-
if (n > fp->buffer.len) { fp->paused = 1; return CURL_WRITEFUNC_PAUSE; }
729+
if (n > fp->buffer.len) {
730+
fp->paused = 1;
731+
return CURL_WRITEFUNC_PAUSE;
732+
}
724733
else if (n == 0) return 0;
725734

726735
memcpy(fp->buffer.ptr.rd, ptr, n);
@@ -729,6 +738,19 @@ static size_t recv_callback(char *ptr, size_t size, size_t nmemb, void *fpv)
729738
return n;
730739
}
731740

741+
742+
size_t header_callback(void *contents, size_t size, size_t nmemb, void *userp) {
743+
size_t realsize = size * nmemb;
744+
kstring_t *resp = (kstring_t *)userp;
745+
746+
if (kputsn((const char *)contents, realsize, resp) == EOF) {
747+
return 0;
748+
}
749+
750+
return realsize;
751+
}
752+
753+
732754
static ssize_t libcurl_read(hFILE *fpv, void *bufferv, size_t nbytes)
733755
{
734756
hFILE_libcurl *fp = (hFILE_libcurl *) fpv;
@@ -764,8 +786,9 @@ static ssize_t libcurl_read(hFILE *fpv, void *bufferv, size_t nbytes)
764786
err = curl_easy_pause(fp->easy, CURLPAUSE_CONT);
765787
if (err != CURLE_OK) { errno = easy_errno(fp->easy, err); return -1; }
766788

767-
while (! fp->paused && ! fp->finished)
789+
while (! fp->paused && ! fp->finished) {
768790
if (wait_perform(fp) < 0) return -1;
791+
}
769792

770793
got = fp->buffer.ptr.rd - buffer;
771794

@@ -1087,6 +1110,8 @@ libcurl_open(const char *url, const char *modes, http_headers *headers)
10871110
CURLcode err;
10881111
CURLMcode errm;
10891112
int save, is_recursive;
1113+
kstring_t in_header = {0, 0, NULL};
1114+
long response;
10901115

10911116
is_recursive = strchr(modes, 'R') != NULL;
10921117

@@ -1163,28 +1188,83 @@ libcurl_open(const char *url, const char *modes, http_headers *headers)
11631188
goto error;
11641189
if ((list = get_header_list(fp)) != NULL)
11651190
err |= curl_easy_setopt(fp->easy, CURLOPT_HTTPHEADER, list);
1166-
err |= curl_easy_setopt(fp->easy, CURLOPT_FOLLOWLOCATION, 1L);
1167-
if (hts_verbose <= 8)
1191+
1192+
if (hts_verbose <= 8 && fp->headers.fail_on_error)
11681193
err |= curl_easy_setopt(fp->easy, CURLOPT_FAILONERROR, 1L);
11691194
if (hts_verbose >= 8)
11701195
err |= curl_easy_setopt(fp->easy, CURLOPT_VERBOSE, 1L);
11711196

1197+
if (fp->headers.redirect) {
1198+
err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERFUNCTION, header_callback);
1199+
err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERDATA, (void *)&in_header);
1200+
} else {
1201+
err |= curl_easy_setopt(fp->easy, CURLOPT_FOLLOWLOCATION, 1L);
1202+
}
1203+
11721204
if (err != 0) { errno = ENOSYS; goto error; }
11731205

11741206
errm = curl_multi_add_handle(fp->multi, fp->easy);
11751207
if (errm != CURLM_OK) { errno = multi_errno(errm); goto error; }
11761208
fp->nrunning++;
11771209

1178-
while (! fp->paused && ! fp->finished)
1210+
while (! fp->paused && ! fp->finished) {
11791211
if (wait_perform(fp) < 0) goto error_remove;
1212+
}
1213+
1214+
curl_easy_getinfo(fp->easy, CURLINFO_RESPONSE_CODE, &response);
1215+
if (fp->headers.http_response_ptr) {
1216+
*fp->headers.http_response_ptr = response;
1217+
}
11801218

11811219
if (fp->finished && fp->final_result != CURLE_OK) {
11821220
errno = easy_errno(fp->easy, fp->final_result);
11831221
goto error_remove;
11841222
}
11851223

1224+
if (fp->headers.redirect) {
1225+
if (response >= 300 && response < 400) { // redirection
1226+
kstring_t new_url = {0, 0, NULL};
1227+
1228+
if (fp->headers.redirect(fp->headers.redirect_data, response,
1229+
&in_header, &new_url)) {
1230+
errno = ENOSYS;
1231+
goto error;
1232+
}
1233+
1234+
err |= curl_easy_setopt(fp->easy, CURLOPT_URL, new_url.s);
1235+
err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERFUNCTION, NULL);
1236+
err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERDATA, NULL);
1237+
free(ks_release(&in_header));
1238+
1239+
if (err != 0) { errno = ENOSYS; goto error; }
1240+
free(ks_release(&new_url));
1241+
1242+
if (restart_from_position(fp, 0) < 0) {
1243+
goto error_remove;
1244+
}
1245+
1246+
if (fp->headers.http_response_ptr) {
1247+
curl_easy_getinfo(fp->easy, CURLINFO_RESPONSE_CODE,
1248+
fp->headers.http_response_ptr);
1249+
}
1250+
1251+
if (fp->finished && fp->final_result != CURLE_OK) {
1252+
errno = easy_errno(fp->easy, fp->final_result);
1253+
goto error_remove;
1254+
}
1255+
} else {
1256+
// we no longer need to look at the headers
1257+
err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERFUNCTION, NULL);
1258+
err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERDATA, NULL);
1259+
free(ks_release(&in_header));
1260+
1261+
if (err != 0) { errno = ENOSYS; goto error; }
1262+
}
1263+
}
1264+
11861265
if (mode == 'r') {
11871266
double dval;
1267+
11881268
if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD,
11891269
&dval) == CURLE_OK && dval >= 0.0)
11901270
fp->file_size = (off_t) (dval + 0.1);
@@ -1200,6 +1280,7 @@ libcurl_open(const char *url, const char *modes, http_headers *headers)
12001280
errno = save;
12011281

12021282
error:
1283+
if (fp->headers.redirect) free(in_header.s);
12031284
save = errno;
12041285
if (fp->easy) curl_easy_cleanup(fp->easy);
12051286
if (fp->multi) curl_multi_cleanup(fp->multi);
@@ -1266,6 +1347,18 @@ static int parse_va_list(http_headers *headers, va_list args)
12661347
if (strcmp(flag, "false") == 0)
12671348
headers->auth_hdr_num = -3;
12681349
}
1350+
else if (strcmp(argtype, "redirect_callback") == 0) {
1351+
headers->redirect = va_arg(args, const redirect_callback);
1352+
}
1353+
else if (strcmp(argtype, "redirect_callback_data") == 0) {
1354+
headers->redirect_data = va_arg(args, void *);
1355+
}
1356+
else if (strcmp(argtype, "http_response_ptr") == 0) {
1357+
headers->http_response_ptr = va_arg(args, long *);
1358+
}
1359+
else if (strcmp(argtype, "fail_on_error") == 0) {
1360+
headers->fail_on_error = va_arg(args, int);
1361+
}
12691362
else { errno = EINVAL; return -1; }
12701363

12711364
return 0;
@@ -1318,7 +1411,8 @@ static int parse_va_list(http_headers *headers, va_list args)
13181411
static hFILE *vhopen_libcurl(const char *url, const char *modes, va_list args)
13191412
{
13201413
hFILE *fp = NULL;
1321-
http_headers headers = { { NULL, 0, 0 }, { NULL, 0, 0 }, NULL, NULL };
1414+
http_headers headers = { .fail_on_error = 1 };
1415+
13221416
if (parse_va_list(&headers, args) == 0) {
13231417
fp = libcurl_open(url, modes, &headers);
13241418
}

0 commit comments

Comments
 (0)