/* archive_write_set_format_warc.c -- WARC/1.0 output format for the archive writer. */
#include "archive_platform.h"
__FBSDID("$FreeBSD$");
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include "archive.h"
#include "archive_entry.h"
#include "archive_entry_locale.h"
#include "archive_private.h"
#include "archive_random_private.h"
#include "archive_write_private.h"
/*
 * Per-archive state of the WARC writer.  Allocated by
 * archive_write_set_format_warc() and released by _warc_free().
 */
struct warc_s {
/* Non-zero when no warcinfo record should be written: set by the
 * "omit-warcinfo" option, and by _warc_header() after its first call. */
unsigned int omit_warcinfo:1;
/* time(NULL) sampled when the format was selected; used for the
 * WARC-Date of every record and for both dates of the warcinfo record. */
time_t now;
/* File type of the entry currently being written; reset to 0 by
 * _warc_finish_entry(). */
mode_t typ;
/* Seeded from `now` at setup; not read anywhere else in this file. */
unsigned int rng;
/* Byte budget for the current entry's payload: set from the announced
 * Content-Length in _warc_header() and used by _warc_data() to cap
 * what it writes. */
uint64_t populz;
};
/*
 * Payload of the warcinfo record.  _warc_header() appends it to a
 * WT_INFO header (content type "application/warc-fields") the first time
 * it runs, unless the warcinfo record has been omitted.
 */
static const char warcinfo[] =
"software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
"format: WARC file version 1.0\r\n";
typedef enum {
WT_NONE,
WT_INFO,
WT_META,
WT_RSRC,
WT_REQ,
WT_RSP,
WT_RVIS,
WT_CONV,
WT_CONT,
LAST_WT
} warc_type_t;
/*
 * The header fields _popul_ehdr() knows how to render for one record.
 */
typedef struct {
/* record type; selects the WARC-Type value */
warc_type_t type;
/* WARC-Target-URI; the line is left out when NULL, and "file://" is
 * prepended when the string contains no "://" */
const char *tgturi;
/* WARC-Record-ID; when NULL a random UUID URN is generated */
const char *recid;
/* rendered as WARC-Date */
time_t rtime;
/* rendered as Last-Modified */
time_t mtime;
/* Content-Type; the line is left out when NULL */
const char *cnttyp;
/* rendered as Content-Length */
uint64_t cntlen;
} warc_essential_hdr_t;
/*
 * Raw UUID storage filled in by _gen_uuid() and formatted by
 * _popul_ehdr().
 * NOTE(review): the bit masks and "%08x" formatting used on these words
 * assume a 32-bit unsigned int -- confirm for all supported targets.
 */
typedef struct {
unsigned int u[4U];
} warc_uuid_t;
/* Format callbacks installed by archive_write_set_format_warc(). */
static int	_warc_options(struct archive_write *a, const char *key,
		    const char *val);
static int	_warc_header(struct archive_write *a,
		    struct archive_entry *entry);
static ssize_t	_warc_data(struct archive_write *a, const void *buf,
		    size_t len);
static int	_warc_finish_entry(struct archive_write *a);
static int	_warc_close(struct archive_write *a);
static int	_warc_free(struct archive_write *a);

/* Private helpers: record-header rendering and record-id generation. */
static ssize_t	_popul_ehdr(struct archive_string *tgt, size_t tsz,
		    warc_essential_hdr_t hdr);
static int	_gen_uuid(warc_uuid_t *tgt);
/*
 * Select WARC/1.0 as the output format of the write handle `_a`.
 *
 * Any format previously attached to the handle is released through its
 * format_free callback, fresh writer state is allocated, and the WARC
 * callbacks are installed.
 *
 * Returns ARCHIVE_OK on success, or ARCHIVE_FATAL (with ENOMEM recorded
 * on the archive) when the state cannot be allocated.  The magic/state
 * check is done by archive_check_magic(), which may return early.
 */
int
archive_write_set_format_warc(struct archive *_a)
{
	struct archive_write *a = (struct archive_write *)_a;
	struct warc_s *w;

	archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
	    ARCHIVE_STATE_NEW, "archive_write_set_format_warc");

	/* Drop whatever format was configured before us. */
	if (a->format_free != NULL) {
		(a->format_free)(a);
	}

	/*
	 * calloc() rather than malloc(): every field -- including populz,
	 * which used to be left indeterminate -- starts out as zero, so
	 * omit_warcinfo is 0 (emit the warcinfo record) and typ is 0 (no
	 * entry in progress).
	 */
	w = calloc(1, sizeof(*w));
	if (w == NULL) {
		archive_set_error(&a->archive, ENOMEM,
		    "Can't allocate warc data");
		return (ARCHIVE_FATAL);
	}
	/* One timestamp for the whole archive; it also seeds rng. */
	w->now = time(NULL);
	w->rng = (unsigned int)w->now;

	a->format_data = w;
	a->format_name = "WARC/1.0";
	a->format_options = _warc_options;
	a->format_write_header = _warc_header;
	a->format_write_data = _warc_data;
	a->format_close = _warc_close;
	a->format_free = _warc_free;
	a->format_finish_entry = _warc_finish_entry;
	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
	a->archive.archive_format_name = "WARC/1.0";
	return (ARCHIVE_OK);
}
/*
 * Option handler of the WARC writer.
 *
 * Recognises "omit-warcinfo" with either no value (NULL) or the value
 * "true", in which case the warcinfo record is suppressed and ARCHIVE_OK
 * is returned.  Every other key/value combination yields ARCHIVE_WARN.
 * NOTE(review): presumably the options layer reads ARCHIVE_WARN as
 * "option not handled here" -- confirm against the caller.
 */
static int
_warc_options(struct archive_write *a, const char *key, const char *val)
{
	struct warc_s *state = a->format_data;

	if (strcmp(key, "omit-warcinfo") != 0) {
		return (ARCHIVE_WARN);
	}
	if (val != NULL && strcmp(val, "true") != 0) {
		return (ARCHIVE_WARN);
	}

	state->omit_warcinfo = 1U;
	return (ARCHIVE_OK);
}
static int
_warc_header(struct archive_write *a, struct archive_entry *entry)
{
struct warc_s *w = a->format_data;
struct archive_string hdr;
#define MAX_HDR_SIZE 512
if (!w->omit_warcinfo) {
ssize_t r;
warc_essential_hdr_t wi = {
WT_INFO,
NULL,
NULL,
0,
0,
"application/warc-fields",
sizeof(warcinfo) - 1U,
};
wi.rtime = w->now;
wi.mtime = w->now;
archive_string_init(&hdr);
r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
if (r >= 0) {
archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
archive_strncat(&hdr, "\r\n\r\n", 4);
__archive_write_output(a, hdr.s, archive_strlen(&hdr));
}
w->omit_warcinfo = 1U;
archive_string_free(&hdr);
}
if (archive_entry_pathname(entry) == NULL) {
archive_set_error(&a->archive, EINVAL,
"Invalid filename");
return (ARCHIVE_WARN);
}
w->typ = archive_entry_filetype(entry);
w->populz = 0U;
if (w->typ == AE_IFREG) {
warc_essential_hdr_t rh = {
WT_RSRC,
NULL,
NULL,
0,
0,
NULL,
0,
};
ssize_t r;
rh.tgturi = archive_entry_pathname(entry);
rh.rtime = w->now;
rh.mtime = archive_entry_mtime(entry);
rh.cntlen = (size_t)archive_entry_size(entry);
archive_string_init(&hdr);
r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
if (r < 0) {
archive_set_error(
&a->archive,
ARCHIVE_ERRNO_FILE_FORMAT,
"cannot archive file");
return (ARCHIVE_WARN);
}
__archive_write_output(a, hdr.s, r);
w->populz = rh.cntlen;
archive_string_free(&hdr);
return (ARCHIVE_OK);
}
archive_set_error(
&a->archive,
ARCHIVE_ERRNO_FILE_FORMAT,
"WARC can only process regular files");
return (ARCHIVE_FAILED);
}
/*
 * Write payload bytes of the current entry.
 *
 * For a regular-file entry at most `populz` bytes -- what is left of the
 * Content-Length announced by _warc_header() -- are passed on to the
 * output; anything beyond that is dropped.  For any other entry type
 * nothing is written and the data is reported as consumed.
 *
 * Returns the number of bytes accepted, or the error code of a failed
 * __archive_write_output().
 */
static ssize_t
_warc_data(struct archive_write *a, const void *buf, size_t len)
{
	struct warc_s *w = a->format_data;

	if (w->typ == AE_IFREG) {
		int rc;

		/* Never write more bytes than were announced. */
		if (len > w->populz) {
			len = (size_t)w->populz;
		}
		rc = __archive_write_output(a, buf, len);
		if (rc != ARCHIVE_OK) {
			return rc;
		}
		/*
		 * Use up the budget.  Without this the clamp above only
		 * limited each individual call, so several calls together
		 * could exceed the announced Content-Length.
		 */
		w->populz -= len;
	}
	return (ssize_t)len;
}
/*
 * Close the current entry.  A regular-file record is terminated with
 * CRLF CRLF; other entry types produced no record, so nothing is
 * written for them.  The remembered entry type is cleared on success
 * only.
 *
 * Returns ARCHIVE_OK, or the error code of the failed write.
 */
static int
_warc_finish_entry(struct archive_write *a)
{
	static const char record_end[] = "\r\n\r\n";
	struct warc_s *state = a->format_data;
	int status;

	if (state->typ == AE_IFREG) {
		status = __archive_write_output(a, record_end,
		    sizeof(record_end) - 1U);
		if (status != ARCHIVE_OK) {
			/* leave typ untouched when the write failed */
			return status;
		}
	}

	state->typ = 0;
	return (ARCHIVE_OK);
}
/*
 * Close callback.  Writes nothing and always returns ARCHIVE_OK.
 */
static int
_warc_close(struct archive_write *a)
{
	/* The handle is deliberately unused. */
	(void)a;

	return (ARCHIVE_OK);
}
/*
 * Release the writer state attached to `a` and clear the pointer so it
 * cannot be freed twice.  Always returns ARCHIVE_OK.
 */
static int
_warc_free(struct archive_write *a)
{
	/* free(NULL) is a no-op, so no guard is needed. */
	free(a->format_data);
	a->format_data = NULL;

	return (ARCHIVE_OK);
}
/*
 * Append the UTC time `t`, formatted by strftime(3) according to `fmt`,
 * to the archive string `as`.
 *
 * Nothing is appended when `t` cannot be broken down into calendar
 * time.  The formatted text is limited to the size of the local buffer
 * (100 bytes).
 */
static void
xstrftime(struct archive_string *as, const char *fmt, time_t t)
{
	struct tm *rt;
#if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S)
	struct tm timeHere;
#endif
	char strtime[100];
	size_t len;

#ifdef HAVE_GMTIME_R
	rt = gmtime_r(&t, &timeHere);
#elif defined(HAVE__GMTIME64_S)
	/*
	 * _gmtime64_s() reports success with 0 and fills timeHere; `rt`
	 * used to be left uninitialised on this path.
	 */
	rt = (_gmtime64_s(&timeHere, &t) == 0) ? &timeHere : NULL;
#else
	rt = gmtime(&t);
#endif
	if (rt == NULL) {
		return;
	}

	len = strftime(strtime, sizeof(strtime) - 1, fmt, rt);
	archive_strncat(as, strtime, len);
}
/*
 * Render the header of one WARC record into `tgt`, replacing its
 * previous contents.
 *
 * Lines are produced in this order: version, WARC-Type, WARC-Target-URI
 * (only if hdr.tgturi is set), WARC-Date, Last-Modified, WARC-Record-ID
 * (a random UUID URN if hdr.recid is NULL), Content-Type (only if
 * hdr.cnttyp is set), Content-Length, and the empty line that ends the
 * header.
 *
 * Returns the header length in bytes, or -1 when hdr.type is not one of
 * WT_INFO/WT_META/WT_RSRC or when the header is `tsz` bytes or longer.
 * In the latter case `tgt` has already been written to.
 */
static ssize_t
_popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
{
	static const char version_line[] = "WARC/1.0\r\n";
	static const char * const type_names[LAST_WT] = {
		NULL, "warcinfo", "metadata", "resource", NULL
	};
	char uuid_text[48U];
	size_t total;

	/* Only the first three real record types can be written. */
	if (!(hdr.type != WT_NONE && hdr.type <= WT_RSRC)) {
		return -1;
	}

	archive_strcpy(tgt, version_line);
	archive_string_sprintf(tgt, "WARC-Type: %s\r\n", type_names[hdr.type]);

	if (hdr.tgturi != NULL) {
		/* Anything lacking a "scheme://" part is taken as a file. */
		const char *colon = strchr(hdr.tgturi, ':');
		const char *prefix = "file://";

		if (colon != NULL && colon[1U] == '/' && colon[2U] == '/') {
			prefix = "";
		}
		archive_string_sprintf(tgt,
		    "WARC-Target-URI: %s%s\r\n", prefix, hdr.tgturi);
	}

	xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
	xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);

	if (hdr.recid == NULL) {
		/* No id supplied by the caller: make one up. */
		warc_uuid_t uuid;

		_gen_uuid(&uuid);
#if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
#define snprintf _snprintf
#endif
		/* 8-4-4-4-12 hex digits, wrapped as a URN */
		snprintf(
		    uuid_text, sizeof(uuid_text),
		    "<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
		    uuid.u[0U],
		    uuid.u[1U] >> 16U, uuid.u[1U] & 0xffffU,
		    uuid.u[2U] >> 16U, uuid.u[2U] & 0xffffU,
		    uuid.u[3U]);
		hdr.recid = uuid_text;
	}
	archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);

	if (hdr.cnttyp != NULL) {
		archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
	}
	archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);

	/* the blank line that separates header and payload */
	archive_strncat(tgt, "\r\n", 2);

	total = archive_strlen(tgt);
	if (total >= tsz) {
		return -1;
	}
	return (ssize_t)total;
}
/*
 * Fill `tgt` with random bytes from archive_random() and then force the
 * fixed bits of a version-4 UUID: the nibble at bits 12-15 of word 1
 * becomes 4 and the two top bits of word 2 become binary 10.  With the
 * formatting used by _popul_ehdr() these land at the start of the third
 * and fourth text groups.
 * NOTE(review): the masks assume a 32-bit unsigned int -- confirm.
 *
 * Always returns 0.
 */
static int
_gen_uuid(warc_uuid_t *tgt)
{
	archive_random(tgt->u, sizeof(tgt->u));

	/* version: 4 */
	tgt->u[1U] = (tgt->u[1U] & 0xffff0fffU) | 0x4000U;
	/* variant: 10xx */
	tgt->u[2U] = (tgt->u[2U] & 0x3fffffffU) | 0x80000000U;

	return 0;
}