/* archive_read_support_format_warc.c */
#include "archive_platform.h"
__FBSDID("$FreeBSD$");
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include "archive.h"
#include "archive_entry.h"
#include "archive_private.h"
#include "archive_read_private.h"
/*
 * Record classification returned by _warc_rdtyp().
 * In the code visible here only WT_RSRC and WT_RSP are ever produced
 * from a WARC-Type header; any other header value yields WT_NONE.
 * NOTE(review): the remaining enumerators presumably mirror the other
 * WARC record types (warcinfo, metadata, request, ...) -- confirm.
 */
typedef enum {
WT_NONE,
WT_INFO,
WT_META,
/* WARC-Type: resource */
WT_RSRC,
WT_REQ,
/* WARC-Type: response */
WT_RSP,
WT_RVIS,
WT_CONV,
WT_CONT,
/* number of enumerators above; not a record type itself */
LAST_WT
} warc_type_t;
/*
 * Length-tagged view onto read-only characters.
 * _warc_rduri() returns one that points into the header buffer, so the
 * data is not NUL-terminated; always honour `len`.
 */
typedef struct {
/* number of characters at str */
size_t len;
/* first character, or NULL when len is 0 */
const char *str;
} warc_string_t;
/*
 * Writable, heap-backed character buffer.
 * For the pool in struct warc_s, `len` is the allocated size in bytes
 * (it is what gets passed to realloc()), not the string length.
 */
typedef struct {
size_t len;
char *str;
} warc_strbuf_t;
/*
 * Per-archive reader state; allocated zero-filled in
 * archive_read_support_format_warc() and released in _warc_cleanup().
 */
struct warc_s {
/* payload length of the current record, taken from Content-Length */
size_t cntlen;
/* payload bytes already handed out by _warc_read() */
size_t cntoff;
/* size of the chunk most recently returned by _warc_read() */
size_t unconsumed;
/* reusable buffer holding the NUL-terminated file name of the entry */
warc_strbuf_t pool;
/* version of the previously parsed record header (0 before the first) */
unsigned int pver;
/* "WARC/x.y" text, formatted whenever a record's version differs from pver */
struct archive_string sver;
};
/* Callbacks registered with __archive_read_register_format(). */
static int _warc_bid(struct archive_read *a, int);
static int _warc_cleanup(struct archive_read *a);
static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
static int _warc_skip(struct archive_read *a);
static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
/*
 * Header field extractors; each scans the `bsz` bytes at `buf`.
 * Note that _warc_rdver() rejects anything shorter than 12 bytes, the
 * [10] in its declaration notwithstanding.
 */
static unsigned int _warc_rdver(const char buf[10], size_t bsz);
static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
static warc_string_t _warc_rduri(const char *buf, size_t bsz);
static ssize_t _warc_rdlen(const char *buf, size_t bsz);
static time_t _warc_rdrtm(const char *buf, size_t bsz);
static time_t _warc_rdmtm(const char *buf, size_t bsz);
/* Locators for the "\r\n\r\n" header terminator and the "\r\n" line end. */
static const char *_warc_find_eoh(const char *buf, size_t bsz);
static const char *_warc_find_eol(const char *buf, size_t bsz);
/*
 * Enable WARC support on a read archive.
 *
 * Allocates the zero-filled reader state and registers the WARC
 * callbacks under the name "warc".  Returns ARCHIVE_OK on success,
 * ARCHIVE_FATAL if the state cannot be allocated, or whatever
 * __archive_read_register_format() reports on failure (the state is
 * freed again in that case).
 */
int
archive_read_support_format_warc(struct archive *_a)
{
	struct archive_read *a = (struct archive_read *)_a;
	struct warc_s *state;
	int status;

	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");

	state = calloc(1, sizeof(*state));
	if (state == NULL) {
		archive_set_error(&a->archive, ENOMEM,
		    "Can't allocate warc data");
		return (ARCHIVE_FATAL);
	}

	status = __archive_read_register_format(
	    a, state, "warc",
	    _warc_bid, NULL, _warc_rdhdr, _warc_read,
	    _warc_skip, NULL, _warc_cleanup, NULL, NULL);
	if (status == ARCHIVE_OK)
		return (ARCHIVE_OK);

	/* registration did not take ownership, so release the state here */
	free(state);
	return (status);
}
/*
 * Release everything owned by the WARC reader state: the file name
 * pool (only when it has a non-zero recorded size), the version string
 * and the state itself.  Always returns ARCHIVE_OK.
 */
static int
_warc_cleanup(struct archive_read *a)
{
	struct warc_s *state = a->format->data;
	char *pooled = (state->pool.len > 0U) ? state->pool.str : NULL;

	/* free(NULL) is a no-op, so an unused pool costs nothing here */
	free(pooled);
	archive_string_free(&state->sver);
	free(state);
	a->format->data = NULL;
	return (ARCHIVE_OK);
}
/*
 * Bid on the input: peek at the first 12 bytes and return 64 when they
 * hold a version line that _warc_rdver() decodes to something between
 * 0.12 (1200) and 1.0 (10000); return -1 otherwise.  `best_bid` is not
 * consulted.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int version;

	(void)best_bid;

	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12)
		return -1;

	version = _warc_rdver(probe, avail);
	if (version >= 1200U && version <= 10000U)
		return (64);
	return -1;
}
/*
 * Parse record headers until one describes a file-like record, then
 * fill in `entry` from it.
 *
 * A record becomes an entry only when its WARC-Type is "resource" or
 * "response" and its WARC-Target-URI yields a non-empty name that does
 * not end in '/'.  Every other record has its payload skipped and the
 * next header is tried.
 *
 * Returns ARCHIVE_OK with `entry` populated, ARCHIVE_EOF when no further
 * header can be read ahead, or ARCHIVE_FATAL (with an error set on the
 * archive) for a malformed header, an unsupported version, an allocation
 * failure or a failed skip.
 */
static int
_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
{
#define HDR_PROBE_LEN (12U)
	struct warc_s *w = a->format->data;
	unsigned int ver;
	const char *buf;
	ssize_t nrd;
	const char *eoh;
	warc_string_t fnam;
	warc_type_t ftyp;
	ssize_t cntlen;
	time_t rtime;
	time_t mtime;

start_over:
	/* peek only; nothing is consumed until the header has been vetted */
	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
	if (nrd < 0) {
		archive_set_error(
		    &a->archive, ARCHIVE_ERRNO_MISC,
		    "Bad record header");
		return (ARCHIVE_FATAL);
	} else if (buf == NULL) {
		/* not even a version line left: treat as end of archive */
		return (ARCHIVE_EOF);
	}

	/* the whole header must sit inside what was read ahead */
	eoh = _warc_find_eoh(buf, nrd);
	if (eoh == NULL) {
		archive_set_error(
		    &a->archive, ARCHIVE_ERRNO_MISC,
		    "Bad record header");
		return (ARCHIVE_FATAL);
	}

	ver = _warc_rdver(buf, eoh - buf);
	if (ver == 0U) {
		archive_set_error(
		    &a->archive, ARCHIVE_ERRNO_MISC,
		    "Invalid record version");
		return (ARCHIVE_FATAL);
	} else if (ver < 1200U || ver > 10000U) {
		/* only versions 0.12 through 1.0 are accepted */
		archive_set_error(
		    &a->archive, ARCHIVE_ERRNO_MISC,
		    "Unsupported record version: %u.%u",
		    ver / 10000, (ver % 10000) / 100);
		return (ARCHIVE_FATAL);
	}

	cntlen = _warc_rdlen(buf, eoh - buf);
	if (cntlen < 0) {
		archive_set_error(
		    &a->archive, EINVAL,
		    "Bad content length");
		return (ARCHIVE_FATAL);
	}

	rtime = _warc_rdrtm(buf, eoh - buf);
	if (rtime == (time_t)-1) {
		archive_set_error(
		    &a->archive, EINVAL,
		    "Bad record time");
		return (ARCHIVE_FATAL);
	}

	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
	if (ver != w->pver) {
		/* format the version text only when it changes */
		archive_string_sprintf(&w->sver,
		    "WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
		w->pver = ver;
	}

	ftyp = _warc_rdtyp(buf, eoh - buf);
	/* remember the payload extent for _warc_read() and _warc_skip() */
	w->cntlen = cntlen;
	w->cntoff = 0U;
	mtime = 0;

	switch (ftyp) {
	case WT_RSRC:
	case WT_RSP:
		fnam = _warc_rduri(buf, eoh - buf);
		/* no name, or a directory-like URI: not extractable */
		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
			fnam.len = 0U;
			fnam.str = NULL;
			break;
		}
		/*
		 * fnam still points into the read-ahead buffer, which is
		 * about to be consumed; keep a NUL-terminated copy in the
		 * pool, growing it in 64 byte steps.
		 */
		if (fnam.len + 1U > w->pool.len) {
			size_t newlen = ((fnam.len + 64U) / 64U) * 64U;
			char *newstr = realloc(w->pool.str, newlen);

			if (newstr == NULL) {
				/* the old pool is still valid and still owned by w */
				archive_set_error(&a->archive, ENOMEM,
				    "Can't allocate warc file name");
				return (ARCHIVE_FATAL);
			}
			w->pool.str = newstr;
			w->pool.len = newlen;
		}
		memcpy(w->pool.str, fnam.str, fnam.len);
		w->pool.str[fnam.len] = '\0';
		fnam.str = w->pool.str;

		/* without a usable Last-Modified, fall back to the record date */
		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
			mtime = rtime;
		}
		break;
	default:
		fnam.len = 0U;
		fnam.str = NULL;
		break;
	}

	/* the header is fully parsed; step over it */
	__archive_read_consume(a, eoh - buf);

	switch (ftyp) {
	case WT_RSRC:
	case WT_RSP:
		if (fnam.len > 0U) {
			archive_entry_set_filetype(entry, AE_IFREG);
			archive_entry_copy_pathname(entry, fnam.str);
			archive_entry_set_size(entry, cntlen);
			archive_entry_set_perm(entry, 0644);
			archive_entry_set_ctime(entry, rtime, 0L);
			archive_entry_set_mtime(entry, mtime, 0L);
			break;
		}
		/* FALLTHROUGH */
	default:
		/* nothing to extract: drop the payload and try the next record */
		if (_warc_skip(a) != ARCHIVE_OK)
			return (ARCHIVE_FATAL);
		goto start_over;
	}
	return (ARCHIVE_OK);
}
static int
_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
{
struct warc_s *w = a->format->data;
const char *rab;
ssize_t nrd;
if (w->cntoff >= w->cntlen) {
eof:
*buf = NULL;
*bsz = 0U;
*off = w->cntoff + 4U;
w->unconsumed = 0U;
return (ARCHIVE_EOF);
}
rab = __archive_read_ahead(a, 1U, &nrd);
if (nrd < 0) {
*bsz = 0U;
return (int)nrd;
} else if (nrd == 0) {
goto eof;
} else if ((size_t)nrd > w->cntlen - w->cntoff) {
nrd = w->cntlen - w->cntoff;
}
*off = w->cntoff;
*bsz = nrd;
*buf = rab;
w->cntoff += nrd;
w->unconsumed = (size_t)nrd;
return (ARCHIVE_OK);
}
static int
_warc_skip(struct archive_read *a)
{
struct warc_s *w = a->format->data;
__archive_read_consume(a, w->cntlen + 4U);
w->cntlen = 0U;
w->cntoff = 0U;
return (ARCHIVE_OK);
}
/*
 * Return `c` with its const qualifier stripped; the address is
 * unchanged.
 *
 * The pointer is re-read through a union rather than cast, which avoids
 * cast-qualifier warnings without doing arithmetic on a made-up address
 * (subtracting unrelated pointers is undefined behaviour).  Callers must
 * still not write through the result if the object really is const.
 */
static void*
deconst(const void *c)
{
	union {
		const void *from;
		void *to;
	} pun;

	pun.from = c;
	return pun.to;
}
/*
 * Find the first occurrence of the `needlesize` bytes at `needle`
 * within the `haysize` bytes at `hay`.
 *
 * Returns a pointer to the match, `hay` itself for an empty needle, or
 * NULL when there is no match.
 *
 * The search jumps to the first byte equal to needle[0], then slides a
 * needle-sized window to the right while maintaining the XOR of the
 * bytes inside it.  Only when that XOR equals the needle's is a
 * memcmp() done, and then only over needlesize - 1 bytes: with equal
 * XOR sums the final byte has to agree as well.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const needle_end = needle + needlesize;
	const char *start;
	const char *head;
	const char *tail;
	const char *nscan;
	unsigned int win_sum;
	unsigned int pat_sum;
	unsigned int all_equal;

	if (needlesize == 0UL)
		return deconst(hay);

	start = memchr(hay, *needle, haysize);
	if (start == NULL)
		return NULL;

	/*
	 * Build both XOR sums over the first window; *start equals
	 * *needle, so it seeds either sum.
	 */
	win_sum = *start;
	pat_sum = *start;
	all_equal = 1U;
	head = start + 1U;
	nscan = needle + 1U;
	while (head < hay_end && nscan < needle_end) {
		win_sum ^= *head;
		pat_sum ^= *nscan;
		all_equal &= (*head == *nscan);
		head++;
		nscan++;
	}

	if (nscan < needle_end) {
		/* ran out of haystack before the needle was covered */
		return NULL;
	}
	if (all_equal)
		return deconst(start);

	/* slide: drop the byte at tail, take in the byte at head */
	tail = start;
	while (head < hay_end) {
		win_sum ^= *tail++;
		win_sum ^= *head;
		if (win_sum == pat_sum &&
		    memcmp(tail, needle, needlesize - 1U) == 0)
			return deconst(tail);
		head++;
	}
	return NULL;
}
/*
 * Parse an unsigned decimal number from `str`, reading no more digits
 * than a value up to `ulim` can need.
 *
 * *ep is set to the first character that was not consumed.  Returns the
 * parsed value, -1 when `str` does not start with a digit, or -2 when
 * the value lies outside [llim, ulim].
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cursor = str;
	int value = 0;
	/* loses one decimal digit per character read; 0 ends the scan */
	int budget = (ulim > 10) ? ulim : 10;

	while (value * 10 <= ulim && budget != 0 &&
	    *cursor >= '0' && *cursor <= '9') {
		value = value * 10 + (*cursor - '0');
		cursor++;
		budget /= 10;
	}

	if (cursor == str)
		value = -1;
	else if (value < llim || value > ulim)
		value = -2;

	*ep = cursor;
	return value;
}
/*
 * Convert a broken-down UTC time to seconds since the Epoch.
 *
 * Uses timegm() or _mkgmtime64() when the build provides them.
 * Otherwise mktime() is called to fill in tm_yday and the POSIX
 * "Seconds Since the Epoch" expression is evaluated by hand.  Every
 * term is widened to time_t first: done in int, the year term alone
 * overflows for dates from 2039 onwards.
 *
 * Returns (time_t)-1 when the conversion fails.
 * NOTE(review): in the fallback mktime() interprets and normalises *t
 * as local time, so fields may shift around DST changes -- confirm
 * whether that matters on platforms lacking timegm().
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
	return (timegm(t));
#elif HAVE__MKGMTIME64
	return (_mkgmtime64(t));
#else
	/* called for its side effect of computing tm_yday */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
/*
 * Parse a timestamp of the form YYYY-MM-DDThh:mm:ssZ, optionally
 * preceded by blanks or tabs.
 *
 * Returns the time in seconds since the Epoch, or (time_t)-1 when a
 * field is missing, out of range, or followed by the wrong delimiter.
 * If `endptr` is not NULL it receives the position where parsing
 * stopped; after a delimiter mismatch that is one past the offending
 * character.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	/* accepted range and trailing delimiter of each field, in order */
	static const struct {
		int lo;
		int hi;
		char delim;
	} spec[] = {
		{ 1583, 4095, '-' },	/* year */
		{ 1, 12, '-' },		/* month */
		{ 1, 31, 'T' },		/* day of month */
		{ 0, 23, ':' },		/* hour */
		{ 0, 59, ':' },		/* minute */
		{ 0, 60, 'Z' }		/* second, leap second allowed */
	};
	int field[sizeof(spec) / sizeof(spec[0])];
	struct tm tm;
	time_t res = (time_t)-1;
	size_t i;

	while (*s == ' ' || *s == '\t')
		++s;

	for (i = 0; i < sizeof(spec) / sizeof(spec[0]); i++) {
		field[i] = strtoi_lim(s, &s, spec[i].lo, spec[i].hi);
		if (field[i] < 0)
			goto out;
		if (*s++ != spec[i].delim)
			goto out;
	}

	memset(&tm, 0, sizeof(tm));
	/* struct tm counts years from 1900 and months from 0 */
	tm.tm_year = field[0] - 1900;
	tm.tm_mon = field[1] - 1;
	tm.tm_mday = field[2];
	tm.tm_hour = field[3];
	tm.tm_min = field[4];
	tm.tm_sec = field[5];
	res = time_from_tm(&tm);

out:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return res;
}
/*
 * Decode the "WARC/x.y" or "WARC/x.yy" version line at the start of
 * `buf`.
 *
 * The result is major * 10000 + minor * 100, so 0.12 gives 1200 and
 * 1.0 gives 10000.  Versions of 0.12 and above must be followed
 * directly by "\r\n", older ones by a blank or tab.  Returns 0 when
 * fewer than 12 bytes are available, the magic or the digits are
 * wrong, or the terminator does not fit the version.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t magic_len = sizeof(magic) - 1U;
	const char *num;
	const char *after;
	unsigned int ver;

	if (bsz < 12)
		return 0U;
	if (memcmp(buf, magic, magic_len) != 0)
		return 0U;

	num = buf + magic_len;
	if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
	    !isdigit((unsigned char)num[2U]))
		return 0U;

	ver = (num[0U] - '0') * 10000U;
	if (isdigit((unsigned char)num[3U])) {
		/* two minor digits, as in "0.12" */
		ver += (num[2U] - '0') * 1000U;
		ver += (num[3U] - '0') * 100U;
		after = num + 4U;
	} else {
		ver += (num[2U] - '0') * 100U;
		after = num + 3U;
	}

	if (ver >= 1200U) {
		if (memcmp(after, "\r\n", 2U) != 0)
			ver = 0U;
	} else if (*after != ' ' && *after != '\t') {
		ver = 0U;
	}
	return ver;
}
/*
 * Classify the record by its WARC-Type header.
 *
 * Returns WT_RSRC for "resource", WT_RSP for "response" and WT_NONE
 * for anything else, including a missing or unterminated header line.
 * Blanks and tabs before the value are ignored; the value must then
 * run exactly up to the line end.
 */
static unsigned int
_warc_rdtyp(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Type:";
	const size_t keylen = sizeof(_key) - 1U;
	const char *cur;
	const char *line_end;

	cur = xmemmem(buf, bsz, _key, keylen);
	if (cur == NULL)
		return WT_NONE;
	cur += keylen;

	line_end = _warc_find_eol(cur, buf + bsz - cur);
	if (line_end == NULL)
		return WT_NONE;

	for (; cur < line_end; cur++) {
		if (*cur != ' ' && *cur != '\t')
			break;
	}

	/* both recognised values are 8 characters long */
	if (cur + 8U != line_end)
		return WT_NONE;
	if (memcmp(cur, "resource", 8U) == 0)
		return WT_RSRC;
	if (memcmp(cur, "response", 8U) == 0)
		return WT_RSP;
	return WT_NONE;
}
/*
 * Derive a file name from the WARC-Target-URI header.
 *
 * The result points into `buf` and is not NUL-terminated.  For a URI
 * starting with "file" it is everything after "://"; for "http" and
 * "ftp" the text up to and including the first '/' after "://" is
 * dropped as well.  An empty result ({0, NULL}) is returned when the
 * header is missing or unterminated, holds no "://", contains
 * whitespace, has fewer than 3 characters before "://", or starts with
 * some other scheme.
 */
static warc_string_t
_warc_rduri(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Target-URI:";
	const size_t keylen = sizeof(_key) - 1U;
	const warc_string_t none = {0U, NULL};
	warc_string_t out;
	const char *val;
	const char *line_end;
	const char *sep;
	const char *path;
	const char *scan;

	val = xmemmem(buf, bsz, _key, keylen);
	if (val == NULL)
		return none;
	val += keylen;

	line_end = _warc_find_eol(val, buf + bsz - val);
	if (line_end == NULL)
		return none;

	for (; val < line_end; ++val) {
		if (*val != ' ' && *val != '\t')
			break;
	}

	sep = xmemmem(val, line_end - val, "://", 3U);
	if (sep == NULL)
		return none;

	/* whitespace anywhere in the value disqualifies it */
	for (scan = val; scan < line_end; scan++) {
		if (isspace((unsigned char)*scan))
			return none;
	}

	/* a scheme of at least 3 characters is needed for the checks below */
	if (sep < (val + 3U))
		return none;
	path = sep + 3U;

	if (memcmp(val, "file", 4U) != 0) {
		if (memcmp(val, "http", 4U) != 0 &&
		    memcmp(val, "ftp", 3U) != 0)
			return none;
		/* step past the host part, including the '/' that ends it */
		while (path < line_end) {
			if (*path++ == '/')
				break;
		}
	}

	out.str = path;
	out.len = line_end - path;
	return out;
}
/*
 * Read the payload size from the Content-Length header.
 *
 * Blanks and tabs before the value are skipped; the value must start
 * with a digit and extend exactly to the end of the line.  Returns the
 * length, or -1 when the header is missing, unterminated, malformed or
 * too large to be represented (strtol() range error).
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const char *val, *eol;
	char *on = NULL;
	long int len;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		return -1;
	}
	val += sizeof(_key) - 1U;
	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
		return -1;
	}

	while (val < eol && (*val == ' ' || *val == '\t'))
		val++;
	/* requiring a leading digit also rules out signs, so len is >= 0 */
	if (!isdigit((unsigned char)*val))
		return -1;

	/* strtol() clamps silently on overflow; errno is the only tell */
	errno = 0;
	len = strtol(val, &on, 10);
	if (errno != 0 || on != eol) {
		return -1;
	}
	return (ssize_t)len;
}
/*
 * Read the record time from the WARC-Date header.
 *
 * Returns the timestamp parsed by xstrpisotime(), or (time_t)-1 when
 * the header is missing, unterminated, or the timestamp does not end
 * exactly at the line end.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const size_t keylen = sizeof(_key) - 1U;
	const char *field;
	const char *line_end;
	char *stop = NULL;
	time_t stamp;

	field = xmemmem(buf, bsz, _key, keylen);
	if (field == NULL)
		return (time_t)-1;
	field += keylen;

	line_end = _warc_find_eol(field, buf + bsz - field);
	if (line_end == NULL)
		return (time_t)-1;

	stamp = xstrpisotime(field, &stop);
	return (stop == line_end) ? stamp : (time_t)-1;
}
/*
 * Read the modification time from the Last-Modified header.
 *
 * Returns the timestamp parsed by xstrpisotime(), or (time_t)-1 when
 * the header is missing, unterminated, or the timestamp does not end
 * exactly at the line end.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const size_t keylen = sizeof(_key) - 1U;
	const char *field;
	const char *line_end;
	char *stop = NULL;
	time_t stamp;

	field = xmemmem(buf, bsz, _key, keylen);
	if (field == NULL)
		return (time_t)-1;
	field += keylen;

	line_end = _warc_find_eol(field, buf + bsz - field);
	if (line_end == NULL)
		return (time_t)-1;

	stamp = xstrpisotime(field, &stop);
	return (stop == line_end) ? stamp : (time_t)-1;
}
/*
 * Locate the end of the record header.
 *
 * Returns a pointer just past the first "\r\n\r\n" within the `bsz`
 * bytes at `buf`, or NULL if that sequence does not occur.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *at = xmemmem(buf, bsz, _marker, mlen);

	return (at == NULL) ? NULL : at + mlen;
}
/*
 * Locate the end of the current line.
 *
 * Returns a pointer to the first "\r\n" within the `bsz` bytes at
 * `buf` (that is, to the '\r'), or NULL if there is none.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}