#include <config.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/mman.h>
#include "squat_internal.h"
#include "index.h"
#include "xmalloc.h"
/* Output buffer that batches bytes before they are written to a file
   descriptor. */
typedef struct {
  char *buf;                /* heap storage holding the pending bytes */
  int   buf_size;           /* size in bytes requested for buf */
  int   data_len;           /* number of pending bytes currently in buf */
  int   fd;                 /* descriptor the pending bytes are written to */
  int   total_output_bytes; /* running count of bytes committed via
                               complete_buffered_write */
} SquatWriteBuffer;
/*
 * Prepare `b` as an empty write buffer of `buf_size` bytes that drains
 * to `fd`.  Always returns SQUAT_OK.
 */
static int init_write_buffer(SquatWriteBuffer* b, int buf_size, int fd) {
  b->fd = fd;
  b->buf_size = buf_size;
  b->data_len = 0;
  b->total_output_bytes = 0;
  b->buf = xmalloc(buf_size);

  return SQUAT_OK;
}
/*
 * Reserve room for up to `len` bytes in `b` and return the address at
 * which the caller may store them.  The caller must afterwards call
 * complete_buffered_write() with the address one past the last byte it
 * actually stored.
 *
 * If the pending bytes plus `len` would reach the buffer size, the
 * pending bytes are written to b->fd first; if `len` alone exceeds the
 * buffer size the buffer is grown to `len` bytes.
 *
 * Returns NULL (after recording SQUAT_ERR_SYSERR or
 * SQUAT_ERR_OUT_OF_MEMORY as the last error) on failure.
 */
static char* prepare_buffered_write(SquatWriteBuffer* b, int len) {
  if (b->data_len + len >= b->buf_size) {
    /* A short write is treated as an error, not retried. */
    if (write(b->fd, b->buf, b->data_len) != b->data_len) {
      squat_set_last_error(SQUAT_ERR_SYSERR);
      return NULL;
    }
    /* The pending bytes are on disk now; forget them before anything
       else can fail so they are never written twice. */
    b->data_len = 0;

    if (b->buf_size < len) {
      /* Keep the old pointer until the reallocation is known to have
         succeeded, and record the new capacity so later calls can use it
         instead of reallocating again. */
      char *grown = (char*)xrealloc(b->buf, len);
      if (grown == NULL) {
        squat_set_last_error(SQUAT_ERR_OUT_OF_MEMORY);
        return NULL;
      }
      b->buf = grown;
      b->buf_size = len;
    }
  }

  return b->buf + b->data_len;
}
/*
 * Commit the bytes the caller stored after prepare_buffered_write().
 * `ptr` is the address one past the last byte stored; the pending length
 * becomes ptr - b->buf and the growth is added to total_output_bytes.
 */
static void complete_buffered_write(SquatWriteBuffer* b, char* ptr) {
  int new_len = ptr - b->buf;

  b->total_output_bytes += new_len - b->data_len;
  b->data_len = new_len;
}
/*
 * Write any pending bytes of `b` to its descriptor and then seek the
 * descriptor back to offset 0.  Returns SQUAT_OK, or SQUAT_ERR with the
 * last error set to SQUAT_ERR_SYSERR if the write or the seek fails.
 */
static int flush_and_reset_buffered_writes(SquatWriteBuffer* b) {
  int const pending = b->data_len;

  if (pending > 0) {
    if (write(b->fd, b->buf, pending) != pending) {
      goto io_failure;
    }
    b->data_len = 0;
  }

  if (lseek(b->fd, 0, SEEK_SET) != 0) {
    goto io_failure;
  }
  return SQUAT_OK;

io_failure:
  squat_set_last_error(SQUAT_ERR_SYSERR);
  return SQUAT_ERR;
}
/* One node of the circular, singly linked list of document IDs that is
   attached to a word while the final trie is being built. */
typedef struct _WordDocEntry {
  struct _WordDocEntry *next; /* following node; the list wraps around */
  int doc_ID;                 /* document in which the word occurs */
} WordDocEntry;
/* Trie leaf used while dumping the index: for every possible final byte
   of a word, the document list of that word (NULL when absent). */
typedef struct {
  short first_valid_entry;  /* lowest index used in docs; 256 when empty */
  short last_valid_entry;   /* highest index used in docs; 0 when empty */
  WordDocEntry *docs[256];  /* per final byte: most recently added node of
                               the circular document list, or NULL */
} SquatWordTableLeafDocs;
/* Trie leaf used while scanning one document: a 256-bit set recording
   which final bytes have been seen for the word prefix. */
typedef struct {
  short first_valid_entry; /* lowest byte value set; 256 when empty */
  short last_valid_entry;  /* highest byte value set; 0 when empty */
  char  presence[32];      /* bit (ch & 7) of presence[ch >> 3] is set when
                              byte value ch is present */
} SquatWordTableLeafPresence;
/* One slot of a SquatWordTable.  Which member is meaningful depends on
   the level of the table and on which phase is using it:
   - interior levels use `table`;
   - the last level of the per-document table uses `leaf_presence`, or
     `leaf_presence_singleton` when only one final byte has been seen;
   - the last level of the final trie uses `leaf_docs`.
   The code distinguishes a singleton from a real leaf_presence pointer
   by testing the low bit of the stored value, so the members must keep
   overlaying each other exactly as declared here. */
typedef union _SquatWordTableEntry {
/* Child table for the next byte of the word, or NULL. */
struct _SquatWordTable* table;
/* Bit set of final bytes seen; only valid when the low bit is clear. */
SquatWordTableLeafPresence* leaf_presence;
/* Single final byte stored as (ch << 1) | 1; the set low bit is the tag. */
int leaf_presence_singleton;
/* Per-final-byte document lists, or NULL. */
SquatWordTableLeafDocs* leaf_docs;
} SquatWordTableEntry;
/* One level of the word trie: 256 slots indexed by a byte of the word,
   plus the range of slots that may currently be in use. */
typedef struct _SquatWordTable {
  short first_valid_entry;          /* lowest slot touched; 256 when empty */
  short last_valid_entry;           /* highest slot touched; 0 when empty */
  SquatWordTableEntry entries[256]; /* child table or leaf per byte value */
} SquatWordTable;
/* State of an index that is being built. */
struct _SquatIndex {
  char *tmp_path;                /* mkstemp() template for the temp files */
  SquatWriteBuffer out;          /* buffered writer for the index file */

  char *doc_ID_list;             /* encoded 32-bit offset of every document
                                    name, 4 bytes per document */
  int doc_ID_list_size;          /* capacity of doc_ID_list in entries */
  int current_doc_ID;            /* ID given to the currently open (or next)
                                    document */
  int current_doc_len;           /* bytes appended to the open document; set
                                    to -1 when a document is closed */

  SquatWordTable *doc_word_table;/* trie of words: per document while
                                    scanning, per first byte while dumping */
  char runover_buf[SQUAT_WORD_SIZE]; /* tail of the data already appended,
                                    for words spanning two append calls */
  int runover_len;               /* number of valid bytes in runover_buf */

  WordDocEntry *word_doc_allocator; /* next unused WordDocEntry while the
                                    final trie is built */
  unsigned char valid_char_bits[32]; /* bit set of byte values allowed in
                                    indexed words */

  SquatStatsCallback stats_callback; /* optional progress callback, or NULL */
  void *stats_callback_closure;  /* first argument passed to the callback */

  SquatWriteBuffer index_buffers[256]; /* temp-file writer per first byte of
                                    a word; buf is NULL until first used */
  int total_num_words[256];      /* per first byte: words written to the
                                    temp file over all documents */
  int doc_words[256];            /* per first byte: new words found in the
                                    current document */
};
/*
 * Allocate an empty top-level word table and store it in *t.  "Empty"
 * means all entries are zero and the valid range is inverted
 * (first 256, last 0).
 */
static void init_doc_word_table(SquatWordTable** t) {
  SquatWordTable *fresh = (SquatWordTable*)xmalloc(sizeof(SquatWordTable));

  memset(fresh->entries, 0, sizeof(fresh->entries));
  fresh->last_valid_entry = 0;
  fresh->first_valid_entry = 256;

  *t = fresh;
}
/*
 * Create an index builder that writes the index to `fd`.
 *
 * `options` may be NULL.  When present, its option_mask selects which of
 * these are honoured:
 *   SQUAT_OPTION_TMP_PATH    - directory for temp files (default "/tmp")
 *   SQUAT_OPTION_VALID_CHARS - NUL-terminated list of byte values allowed
 *                              in indexed words (default: all bytes)
 *   SQUAT_OPTION_STATISTICS  - progress callback and its closure
 *
 * Space for the file header is reserved (zero filled) at the start of the
 * output; the real header is written by index_close_internal().
 *
 * Returns the new index, or NULL on failure.
 */
SquatIndex* squat_index_init(int fd, SquatOptions const* options) {
  SquatIndex* index;
  int i;
  int path_len;
  char* buf;
  char const* tmp_path;

  squat_set_last_error(SQUAT_ERR_OK);

  index = (SquatIndex*)xmalloc(sizeof(SquatIndex));

  /* Build the mkstemp() template "<tmp_path>/squatXXXXXX". */
  if (options != NULL && (options->option_mask & SQUAT_OPTION_TMP_PATH) != 0) {
    tmp_path = options->tmp_path;
  } else {
    tmp_path = "/tmp";
  }
  path_len = strlen(tmp_path);
  index->tmp_path = xmalloc(path_len + 1 + 12);
  memcpy(index->tmp_path, tmp_path, path_len);
  strcpy(index->tmp_path + path_len, "/squatXXXXXX");

  /* Build the bit set of byte values that may appear in a word. */
  if (options != NULL &&
      (options->option_mask & SQUAT_OPTION_VALID_CHARS) != 0) {
    memset(index->valid_char_bits, 0, sizeof(index->valid_char_bits));
    for (i = 0; options->valid_chars[i] != 0; i++) {
      int ch = (unsigned char)options->valid_chars[i];

      index->valid_char_bits[ch >> 3] |= 1 << (ch & 7);
    }
  } else {
    memset(index->valid_char_bits, 255, sizeof(index->valid_char_bits));
  }

  if (options != NULL &&
      (options->option_mask & SQUAT_OPTION_STATISTICS) != 0) {
    index->stats_callback = options->stats_callback;
    index->stats_callback_closure = options->stats_callback_closure;
  } else {
    index->stats_callback = NULL;
    index->stats_callback_closure = NULL;
  }

  /* A NULL buf marks a temp-file buffer that has not been opened yet. */
  for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
    index->index_buffers[i].buf = NULL;
  }

  index->doc_ID_list_size = 1000;
  index->doc_ID_list = (char*)xmalloc(index->doc_ID_list_size*sizeof(SquatInt32));

  if (init_write_buffer(&index->out, 128*1024, fd) != SQUAT_OK) {
    goto cleanup_doc_ID_list;
  }

  /* Reserve zero-filled space for the header at the start of the file. */
  buf = prepare_buffered_write(&index->out, sizeof(SquatDiskHeader));
  if (buf == NULL) {
    goto cleanup_out_buffer;
  }
  memset(buf, 0, sizeof(SquatDiskHeader));
  complete_buffered_write(&index->out, buf + sizeof(SquatDiskHeader));

  index->current_doc_ID = 0;
  /* No document is open yet.  index_close_internal() tests
     current_doc_len >= 0 to decide whether a document still has to be
     closed, so this must not be left uninitialised. */
  index->current_doc_len = -1;
  index->runover_len = 0;
  index->word_doc_allocator = NULL;
  memset(index->doc_words, 0, sizeof(index->doc_words));

  init_doc_word_table(&index->doc_word_table);

  memset(index->total_num_words, 0, sizeof(index->total_num_words));

  return index;

cleanup_out_buffer:
  free(index->out.buf);

cleanup_doc_ID_list:
  free(index->doc_ID_list);
  free(index->tmp_path);
  free(index);
  return NULL;
}
/*
 * Create an anonymous temporary file from index->tmp_path and set up `b`
 * to write to it with a 64KB buffer.
 *
 * The file is unlinked immediately after creation so only the open
 * descriptor keeps it alive.  mkstemp() overwrites the trailing "XXXXXX"
 * of the template, so the template is restored for the next call.
 *
 * Returns SQUAT_OK, or SQUAT_ERR on failure; SQUAT_ERR_SYSERR is recorded
 * when mkstemp() or unlink() fails.
 */
static int init_write_buffer_to_temp(SquatIndex* index, SquatWriteBuffer* b) {
  int fd = mkstemp(index->tmp_path);
  int unlink_result;

  if (fd < 0) {
    squat_set_last_error(SQUAT_ERR_SYSERR);
    return SQUAT_ERR;
  }

  unlink_result = unlink(index->tmp_path);

  /* Restore the template whether or not the unlink worked, so a later
     mkstemp() call still sees a valid "XXXXXX" suffix. */
  strcpy(index->tmp_path + strlen(index->tmp_path) - 6, "XXXXXX");

  if (unlink_result < 0) {
    squat_set_last_error(SQUAT_ERR_SYSERR);
    goto cleanup_fd;
  }

  if (init_write_buffer(b, 64*1024, fd) != SQUAT_OK) {
    goto cleanup_fd;
  }

  return SQUAT_OK;

cleanup_fd:
  /* Close the descriptor we opened; b->fd is not necessarily set here. */
  close(fd);
  return SQUAT_ERR;
}
/*
 * Start a new document called `name`.
 *
 * Records the position of the name (relative to the end of the header
 * area) in the document offset list, writes the NUL-terminated name to
 * the index file, and resets the per-document counters.
 *
 * Returns SQUAT_OK, or SQUAT_ERR on failure.
 */
int squat_index_open_document(SquatIndex* index, char const* name) {
  char* dest;
  int bytes_needed;

  squat_set_last_error(SQUAT_ERR_OK);

  /* Double the offset list when it has no slot left for this document. */
  if (index->doc_ID_list_size <= index->current_doc_ID) {
    index->doc_ID_list_size *= 2;
    index->doc_ID_list =
      (char*)xrealloc(index->doc_ID_list,
                      index->doc_ID_list_size*sizeof(SquatInt32));
    if (index->doc_ID_list == NULL) {
      squat_set_last_error(SQUAT_ERR_OUT_OF_MEMORY);
      return SQUAT_ERR;
    }
  }

  squat_encode_32(index->doc_ID_list + index->current_doc_ID*4,
                  index->out.total_output_bytes - sizeof(SquatDiskHeader));

  /* The name is stored including its terminating NUL. */
  bytes_needed = strlen(name) + 1;
  dest = prepare_buffered_write(&index->out, bytes_needed);
  if (dest == NULL) {
    return SQUAT_ERR;
  }
  strcpy(dest, name);
  complete_buffered_write(&index->out, dest + bytes_needed);

  memset(index->doc_words, 0, sizeof(index->doc_words));
  index->runover_len = 0;
  index->current_doc_len = 0;

  return SQUAT_OK;
}
/*
 * Free table `t` and everything hanging off it.
 *
 * `depth` says how `t`'s entries are interpreted: above 2 they are child
 * tables, which are deleted recursively with depth - 1; otherwise they
 * are leaves, which are freed unless NULL or tagged (low bit set) as an
 * inline singleton.
 */
static void delete_doc_word_table(SquatWordTable* t, int depth) {
  int const interior = depth > 2;
  int slot;

  for (slot = 0; slot < VECTOR_SIZE(t->entries); slot++) {
    SquatWordTableEntry* e = t->entries + slot;

    if (interior) {
      if (e->table != NULL) {
        delete_doc_word_table(e->table, depth - 1);
      }
    } else if (e->leaf_presence != NULL && ((int)e->leaf_presence & 1) == 0) {
      free(e->leaf_presence);
    }
  }

  free(t);
}
#define SQUAT_ADD_NEW_WORD (SQUAT_LAST_BUILTIN + 1)

/*
 * Mark byte value `ch` as present in leaf `p`, widening the leaf's valid
 * range to include it.  Returns SQUAT_ADD_NEW_WORD if the bit was not set
 * before, SQUAT_OK if it already was.
 */
static int set_presence_bit(SquatWordTableLeafPresence* p, int ch) {
  char* cell = &p->presence[ch >> 3];
  int bit = 1 << (ch & 7);

  if (p->first_valid_entry > ch) {
    p->first_valid_entry = ch;
  }
  if (p->last_valid_entry < ch) {
    p->last_valid_entry = ch;
  }

  if ((*cell & bit) != 0) {
    return SQUAT_OK;
  }
  *cell |= bit;
  return SQUAT_ADD_NEW_WORD;
}
/*
 * Insert the word data[0 .. data_len-1] into index->doc_word_table.
 *
 * All bytes but the last two select (and create on demand) nested tables;
 * the second to last byte selects the leaf slot and the last byte is
 * recorded inside that leaf.
 *
 * word_entry == NULL: presence mode.  The leaf only records that the word
 *   occurs.  A leaf with a single final byte is kept inline as a tagged
 *   integer and is promoted to a SquatWordTableLeafPresence bit set when a
 *   second, different final byte arrives.
 * word_entry != NULL: document-list mode.  word_entry is appended to the
 *   circular list kept for the word in a SquatWordTableLeafDocs.
 *
 * Returns SQUAT_ADD_NEW_WORD when the word was not in the table before,
 * SQUAT_OK when it already was.
 */
static int add_to_table(SquatIndex* index, char const* data, int data_len,
WordDocEntry* word_entry) {
SquatWordTable* t = index->doc_word_table;
int ch;
SquatWordTableEntry* e;
/* Walk down through the interior tables, creating missing ones. */
while (data_len > 2) {
ch = (unsigned char)data[0];
if (ch < t->first_valid_entry) {
t->first_valid_entry = ch;
}
if (ch > t->last_valid_entry) {
t->last_valid_entry = ch;
}
e = t->entries + ch;
t = e->table;
if (t == NULL) {
t = (SquatWordTable*)xmalloc(sizeof(SquatWordTable));
e->table = t;
/* Inverted range marks the new table as empty. */
t->first_valid_entry = 256;
t->last_valid_entry = 0;
memset(t->entries, 0, sizeof(t->entries));
}
data++;
data_len--;
}
/* data[0] now selects the leaf slot in the last table ... */
ch = (unsigned char)data[0];
if (ch < t->first_valid_entry) {
t->first_valid_entry = ch;
}
if (ch > t->last_valid_entry) {
t->last_valid_entry = ch;
}
e = t->entries + ch;
/* ... and data[1] is the final byte recorded inside the leaf. */
ch = (unsigned char)data[1];
if (word_entry == NULL) {
/* Presence mode.  A set low bit tags the slot as an inline singleton. */
if (((int)e->leaf_presence & 1) != 0) {
int oldch = e->leaf_presence_singleton >> 1;
if (oldch != ch) {
/* Second distinct final byte: promote the singleton to a bit set
   holding both the new and the old byte. */
SquatWordTableLeafPresence* p;
p = (SquatWordTableLeafPresence*)
xmalloc(sizeof(SquatWordTableLeafPresence));
p->first_valid_entry = 256;
p->last_valid_entry = 0;
memset(p->presence, 0, sizeof(p->presence));
e->leaf_presence = p;
set_presence_bit(p, ch);
/* oldch differs from ch and p was empty, so this call reports
   SQUAT_ADD_NEW_WORD, which is what the new word `ch` deserves. */
return set_presence_bit(p, oldch);
}
/* Same byte as the singleton: falls through to return SQUAT_OK. */
} else if (e->leaf_presence == NULL) {
/* First final byte for this slot: store it inline.  The pointer
   member is set to 1 first and then the int member is written over
   it. */
e->leaf_presence = (void*)1;
e->leaf_presence_singleton = (ch << 1) | 1;
return SQUAT_ADD_NEW_WORD;
} else {
return set_presence_bit(e->leaf_presence, ch);
}
} else {
/* Document-list mode. */
SquatWordTableLeafDocs* docs = e->leaf_docs;
WordDocEntry** entry_ptr;
if (docs == NULL) {
docs = (SquatWordTableLeafDocs*)
xmalloc(sizeof(SquatWordTableLeafDocs));
docs->first_valid_entry = 256;
docs->last_valid_entry = 0;
memset(docs->docs, 0, sizeof(docs->docs));
e->leaf_docs = docs;
}
entry_ptr = docs->docs + ch;
if (*entry_ptr == NULL) {
if (ch < docs->first_valid_entry) {
docs->first_valid_entry = ch;
}
if (ch > docs->last_valid_entry) {
docs->last_valid_entry = ch;
}
/* A one-element circular list points at itself. */
word_entry->next = word_entry;
*entry_ptr = word_entry;
return SQUAT_ADD_NEW_WORD;
} else {
/* *entry_ptr is the most recently added node and its next is the
   oldest one; link the new node after it and make it the new tail. */
word_entry->next = (*entry_ptr)->next;
(*entry_ptr)->next = word_entry;
*entry_ptr = word_entry;
}
}
return SQUAT_OK;
}
/*
 * Record that the (SQUAT_WORD_SIZE - 1)-byte word at `word_ptr` occurs in
 * document `doc_ID`, using the next unused WordDocEntry from
 * index->word_doc_allocator.  Always returns SQUAT_OK.
 */
static int add_word_to_trie(SquatIndex* index, char const* word_ptr,
                            int doc_ID) {
  WordDocEntry* slot = index->word_doc_allocator;

  index->word_doc_allocator = slot + 1;
  slot->doc_ID = doc_ID;
  add_to_table(index, word_ptr, SQUAT_WORD_SIZE - 1, slot);

  return SQUAT_OK;
}
/*
 * Add the SQUAT_WORD_SIZE-byte word at `data` to the current document's
 * word table.  Words containing a byte that is not in
 * index->valid_char_bits are silently skipped.  When the word is new for
 * this document, the per-first-byte counter in index->doc_words is
 * incremented.
 *
 * Returns SQUAT_OK in all of the above cases; any other result of
 * add_to_table() is passed through unchanged.
 */
static int add_word_to_table(SquatIndex* index, char const* data) {
  int pos = 0;
  int status;

  while (pos < SQUAT_WORD_SIZE) {
    int ch = (unsigned char)data[pos];
    int allowed = index->valid_char_bits[ch >> 3] & (1 << (ch & 7));

    if (!allowed) {
      return SQUAT_OK;
    }
    pos++;
  }

  status = add_to_table(index, data, SQUAT_WORD_SIZE, NULL);
  if (status != SQUAT_ADD_NEW_WORD) {
    return status;
  }

  index->doc_words[(unsigned char)data[0]]++;
  return SQUAT_OK;
}
/*
 * Feed the next `data_len` bytes of the currently open document.
 *
 * Every window of SQUAT_WORD_SIZE consecutive bytes is passed to
 * add_word_to_table().  Windows that straddle the previous call are
 * assembled from index->runover_buf, which holds the tail of the data
 * appended so far.
 *
 * Returns SQUAT_OK, or SQUAT_ERR if add_word_to_table() reports anything
 * other than SQUAT_OK.
 */
int squat_index_append_document(SquatIndex* index, char const* data,
int data_len) {
int i;
char buf[SQUAT_WORD_SIZE];
int new_runover;
int new_runover_data;
assert(data_len >= 0);
squat_set_last_error(SQUAT_ERR_OK);
if (data_len == 0) {
return SQUAT_OK;
}
/* Windows that start at offset i inside the run-over bytes and are
   completed with the first bytes of `data`.  The test makes sure that
   `data` holds enough bytes to complete the window. */
for (i = 0; i < index->runover_len; i++) {
if (index->runover_len - i + data_len >= SQUAT_WORD_SIZE) {
memcpy(buf, index->runover_buf + i, index->runover_len - i);
memcpy(buf + index->runover_len - i, data,
SQUAT_WORD_SIZE - (index->runover_len - i));
if (add_word_to_table(index, buf) != SQUAT_OK) {
return SQUAT_ERR;
}
}
}
/* Windows that lie entirely inside `data`. */
for (i = 0; i <= data_len - SQUAT_WORD_SIZE; i++) {
if (add_word_to_table(index, data + i) != SQUAT_OK) {
return SQUAT_ERR;
}
}
/* New run-over = the last min(runover_len + data_len, SQUAT_WORD_SIZE)
   bytes of the old run-over followed by `data`. */
new_runover = index->runover_len + data_len;
if (new_runover > SQUAT_WORD_SIZE) {
new_runover = SQUAT_WORD_SIZE;
}
/* Number of those bytes that come from `data`. */
new_runover_data = data_len;
if (new_runover_data > new_runover) {
new_runover_data = new_runover;
}
/* Slide the surviving tail of the old run-over to the front
   (memmove because source and destination overlap) ... */
memmove(index->runover_buf,
index->runover_buf + index->runover_len -
(new_runover - new_runover_data),
new_runover - new_runover_data);
/* ... and append the tail of `data` behind it. */
memcpy(index->runover_buf + new_runover - new_runover_data,
data + data_len - new_runover_data, new_runover_data);
index->runover_len = new_runover;
index->current_doc_len += data_len;
return SQUAT_OK;
}
/*
 * Append the SQUAT_WORD_SIZE - 1 bytes at `word` to buffer `b`.
 * Returns SQUAT_OK, or SQUAT_ERR if no buffer space could be obtained.
 */
static int output_word(SquatWriteBuffer* b, char const* word) {
  char* dest = prepare_buffered_write(b, SQUAT_WORD_SIZE - 1);

  if (dest != NULL) {
    memcpy(dest, word, SQUAT_WORD_SIZE - 1);
    complete_buffered_write(b, dest + SQUAT_WORD_SIZE - 1);
    return SQUAT_OK;
  }

  return SQUAT_ERR;
}
/*
 * Write every word stored below table `t` to buffer `b` and empty the
 * table on the way.
 *
 * `len` is the number of word bytes still to be determined; len == 2
 * means the entries of `t` are leaves.  `word` is where this level stores
 * its byte; each recursion step passes word + 1, so `word` walks through
 * the caller's word buffer.
 *
 * Leaves are released (bit sets freed, slots set to NULL) and the valid
 * range of `t` is reset; the tables themselves are kept.
 *
 * Returns SQUAT_OK or SQUAT_ERR.  `index` is only passed along to the
 * recursive call.
 */
static int write_words(SquatIndex* index, SquatWriteBuffer* b,
SquatWordTable* t, int len, char* word) {
if (len == 2) {
int i;
for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
SquatWordTableEntry* e = t->entries + i;
word[0] = (char)i;
if (((int)e->leaf_presence & 1) != 0) {
/* Inline singleton: the final byte is stored in the slot itself. */
word[1] = (char)(e->leaf_presence_singleton >> 1);
e->leaf_presence = NULL;
/* When the top-level call passes len == SQUAT_WORD_SIZE - 1, `word`
   is SQUAT_WORD_SIZE - 3 bytes past the start of the caller's buffer
   here, so this passes the start of the complete word. */
if (output_word(b, word - (SQUAT_WORD_SIZE - 3)) != SQUAT_OK) {
return SQUAT_ERR;
}
} else if (e->leaf_presence != NULL) {
SquatWordTableLeafPresence* p = e->leaf_presence;
/* This i shadows the outer i and indexes bytes of the bit set. */
int i;
int last_byte = p->last_valid_entry >> 3;
for (i = p->first_valid_entry >> 3; i <= last_byte; i++) {
/* NOTE(review): this early return neither frees p nor records a
   last error. */
if(i >= VECTOR_SIZE(p->presence)) {
return SQUAT_ERR;
} else {
int bits = (unsigned char)p->presence[i];
int j;
/* Emit one word per set bit, lowest bit first. */
for (j = 0; bits > 0; j++, bits >>= 1) {
if ((bits & 1) != 0) {
word[1] = (char)(i*8 + j);
if (output_word(b, word - (SQUAT_WORD_SIZE - 3)) != SQUAT_OK) {
return SQUAT_ERR;
}
}
}
}
}
free(p);
e->leaf_presence = NULL;
}
}
} else {
int i;
for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
SquatWordTable* new_t = t->entries[i].table;
if (new_t != NULL) {
word[0] = (char)i;
if (write_words(index, b, new_t, len - 1, word + 1)
!= SQUAT_OK) {
return SQUAT_ERR;
}
}
}
}
/* Mark the table as empty so it can be reused. */
t->first_valid_entry = 256;
t->last_valid_entry = 0;
return SQUAT_OK;
}
/*
 * Finish the currently open document.
 *
 * Writes the document length to the index file, reports the per-first-byte
 * word counts to the statistics callback (if any), and then, for every
 * first byte that gained words in this document, appends a record
 * (document ID, word count, the words without their first byte) to that
 * byte's temporary file, creating the file on first use.
 *
 * Afterwards current_doc_len is -1 and current_doc_ID is advanced.
 * Returns SQUAT_OK, or SQUAT_ERR on failure.
 */
int squat_index_close_document(SquatIndex* index) {
  char* out_ptr;
  int first;

  squat_set_last_error(SQUAT_ERR_OK);

  out_ptr = prepare_buffered_write(&index->out, 10);
  if (out_ptr == NULL) {
    return SQUAT_ERR;
  }
  out_ptr = squat_encode_I(out_ptr, index->current_doc_len);
  complete_buffered_write(&index->out, out_ptr);

  if (index->stats_callback != NULL) {
    SquatStatsEvent event;

    event.generic.type = SQUAT_STATS_COMPLETED_DOC;
    event.completed_doc.num_unique_words = index->doc_words;
    index->stats_callback(index->stats_callback_closure, &event);
  }

  for (first = 0; first < VECTOR_SIZE(index->doc_words); first++) {
    SquatWriteBuffer* tmp = index->index_buffers + first;
    int word_count = index->doc_words[first];
    char word_buf[SQUAT_WORD_SIZE - 1];
    char* write_ptr;
    int start_offset;

    if (word_count <= 0) {
      continue;
    }

    /* Open the temp file for this first byte on first use. */
    if (tmp->buf == NULL
        && init_write_buffer_to_temp(index, tmp) != SQUAT_OK) {
      return SQUAT_ERR;
    }

    index->total_num_words[first] += word_count;

    /* Record header: document ID and number of words that follow. */
    write_ptr = prepare_buffered_write(tmp, 20);
    if (write_ptr == NULL) {
      return SQUAT_ERR;
    }
    write_ptr = squat_encode_I(write_ptr, index->current_doc_ID);
    write_ptr = squat_encode_I(write_ptr, word_count);
    complete_buffered_write(tmp, write_ptr);

    start_offset = tmp->total_output_bytes;
    if (write_words(index, tmp,
                    index->doc_word_table->entries[first].table,
                    SQUAT_WORD_SIZE - 1, word_buf) != SQUAT_OK) {
      return SQUAT_ERR;
    }

    /* Exactly word_count words of SQUAT_WORD_SIZE - 1 bytes were written. */
    assert(tmp->total_output_bytes - start_offset
           == (SQUAT_WORD_SIZE - 1)*word_count);
  }

  index->current_doc_len = -1;
  index->current_doc_ID++;

  return SQUAT_OK;
}
/*
 * Write the on-disk form of one trie table to index->out: a description
 * of which byte values are present, followed by one encoded offset per
 * present value.
 *
 * `offset_buf[ch]` is the offset for byte value ch; values <= 0 count as
 * absent.  The presence description is one of:
 *   - two zero bytes, when the table's valid range is empty (nothing
 *     follows);
 *   - a single byte >= 32 holding the only valid byte value;
 *   - otherwise: index of the first bitmap byte, number of bitmap bytes
 *     minus one, then the bitmap itself.
 *
 * Returns SQUAT_OK, or SQUAT_ERR if buffer space cannot be obtained.
 */
static int dump_word_table_offsets(SquatIndex* index, SquatWordTable* t,
                                   int *offset_buf) {
  SquatWriteBuffer* out = &index->out;
  int const lo = t->first_valid_entry;
  int const hi = t->last_valid_entry;
  char* p;
  int count;
  int ch;

  if (lo > hi) {
    p = prepare_buffered_write(out, 2);
    if (p == NULL) {
      return SQUAT_ERR;
    }
    p[0] = p[1] = 0;
    complete_buffered_write(out, p + 2);
    return SQUAT_OK;
  }

  if (lo == hi && hi >= 32) {
    /* Single entry whose value cannot be mistaken for a bitmap start. */
    p = prepare_buffered_write(out, 1);
    if (p == NULL) {
      return SQUAT_ERR;
    }
    *p++ = (char)hi;
    count = 1;
  } else {
    int base = lo >> 3;
    int nbytes = (hi >> 3) - base + 1;

    p = prepare_buffered_write(out, 2 + nbytes);
    if (p == NULL) {
      return SQUAT_ERR;
    }
    *p++ = (char)base;
    *p++ = (char)(nbytes - 1);
    memset(p, 0, nbytes);

    count = 0;
    for (ch = lo; ch <= hi; ch++) {
      if (offset_buf[ch] > 0) {
        count++;
        p[(ch >> 3) - base] |= 1 << (ch & 7);
      }
    }
    p += nbytes;
  }
  complete_buffered_write(out, p);

  /* Reserve 10 bytes per offset that will be written. */
  p = prepare_buffered_write(out, 10*count);
  if (p == NULL) {
    return SQUAT_ERR;
  }
  for (ch = lo; ch <= hi; ch++) {
    if (offset_buf[ch] > 0) {
      p = squat_encode_I(p, offset_buf[ch]);
    }
  }
  complete_buffered_write(out, p);

  return SQUAT_OK;
}
/*
 * Write to index->out the description of which final byte values have a
 * document list in leaf `docs`.
 *
 * The leaf must not be empty.  The description is either a single byte
 * >= 32 holding the only valid byte value, or: index of the first bitmap
 * byte, number of bitmap bytes minus one, then the bitmap with one bit
 * per byte value whose list is non-NULL.
 *
 * Returns SQUAT_OK, or SQUAT_ERR if buffer space cannot be obtained.
 */
static int dump_doc_list_present_bits(SquatIndex* index,
                                      SquatWordTableLeafDocs* docs) {
  int start_present = docs->first_valid_entry;
  int end_present = docs->last_valid_entry;
  char* buf;

  assert(start_present <= end_present);

  if (end_present == start_present && end_present >= 32) {
    if ((buf = prepare_buffered_write(&index->out, 1)) == NULL) {
      return SQUAT_ERR;
    }
    *buf++ = (char)end_present;
  } else {
    int first_byte = start_present >> 3;
    int byte_count = (end_present >> 3) - first_byte + 1;
    int i;

    if ((buf = prepare_buffered_write(&index->out, 2 + byte_count)) == NULL) {
      return SQUAT_ERR;
    }
    *buf++ = (char)first_byte;
    *buf++ = (char)(byte_count - 1);
    memset(buf, 0, byte_count);
    for (i = start_present; i <= end_present; i++) {
      if (docs->docs[i] != NULL) {
        buf[(i >> 3) - first_byte] |= 1 << (i & 7);
      }
    }
    buf += byte_count;
  }

  complete_buffered_write(&index->out, buf);
  return SQUAT_OK;
}
/*
 * Write to index->out the document list of every word in leaf `docs`, in
 * order of final byte value.
 *
 * A list with a single document is written as one value
 * (doc_ID << 1) | 1.  Any other list is written as (run_size << 1),
 * where run_size is the encoded size of what follows, and then a sequence
 * of runs.  A run is a maximal stretch of consecutive document IDs:
 *   - a run of one ID is written as (delta << 1) | 1;
 *   - a longer run is written as (count << 1) followed by delta;
 * where delta is the first ID of the run minus the ID preceding the run
 * (0 before the first run).
 *
 * Each list is walked twice: once to compute run_size, once to write.
 * Returns SQUAT_OK, or SQUAT_ERR if buffer space cannot be obtained.
 */
static int dump_doc_list_docs(SquatIndex* index,
SquatWordTableLeafDocs* docs) {
int i;
WordDocEntry** doc_list = docs->docs;
for (i = docs->first_valid_entry; i <= docs->last_valid_entry; i++) {
if (doc_list[i] != NULL) {
WordDocEntry* first_doc;
WordDocEntry* doc;
int run_size = 0;
int last_doc_ID;
int run_seq_delta = 0;
int run_seq_count;
int doc_count = 0;
char* buf;
/* The list is circular: start at the node after the stored one and
   stop when we are back at the start. */
doc = first_doc = doc_list[i]->next;
last_doc_ID = 0;
run_seq_count = 0;
/* Pass 1: count the documents and the bytes the runs will need. */
do {
if (doc->doc_ID == last_doc_ID + 1 && run_seq_count > 0) {
/* Extends the current run. */
run_seq_count++;
} else {
/* Starts a new run; account for the one just finished, if any. */
if (run_seq_count > 0) {
if (run_seq_count > 1) {
run_size += squat_count_encode_I(run_seq_count << 1)
+ squat_count_encode_I(run_seq_delta);
} else {
run_size += squat_count_encode_I((run_seq_delta << 1) | 1);
}
}
run_seq_count = 1;
run_seq_delta = doc->doc_ID - last_doc_ID;
}
last_doc_ID = doc->doc_ID;
doc = doc->next;
doc_count++;
} while (doc != first_doc);
/* Account for the final run. */
if (run_seq_count > 0) {
if (run_seq_count > 1) {
run_size += squat_count_encode_I(run_seq_count << 1)
+ squat_count_encode_I(run_seq_delta);
} else {
run_size += squat_count_encode_I((run_seq_delta << 1) | 1);
}
}
/* 10 extra bytes are reserved for the leading value -- presumably an
   upper bound on one squat_encode_I() result; TODO confirm. */
if ((buf = prepare_buffered_write(&index->out, 10 + run_size))
== NULL) {
return SQUAT_ERR;
}
/* doc is back at first_doc here. */
if (doc_count == 1) {
buf = squat_encode_I(buf, (doc->doc_ID << 1) | 1);
} else {
buf = squat_encode_I(buf, run_size << 1);
last_doc_ID = 0;
run_seq_count = 0;
/* Pass 2: same walk as pass 1, writing instead of counting. */
do {
if (doc->doc_ID == last_doc_ID + 1 && run_seq_count > 0) {
run_seq_count++;
} else {
if (run_seq_count > 0) {
if (run_seq_count > 1) {
buf = squat_encode_I(buf, run_seq_count << 1);
buf = squat_encode_I(buf, run_seq_delta);
} else {
buf = squat_encode_I(buf, (run_seq_delta << 1) | 1);
}
}
run_seq_count = 1;
run_seq_delta = doc->doc_ID - last_doc_ID;
}
last_doc_ID = doc->doc_ID;
doc = doc->next;
} while (doc != first_doc);
/* Write the final run. */
if (run_seq_count > 0) {
if (run_seq_count > 1) {
buf = squat_encode_I(buf, run_seq_count << 1);
buf = squat_encode_I(buf, run_seq_delta);
} else {
buf = squat_encode_I(buf, (run_seq_delta << 1) | 1);
}
}
}
complete_buffered_write(&index->out, buf);
}
}
return SQUAT_OK;
}
/*
 * Write the subtree rooted at table `t` to index->out, children first,
 * then the table itself, and store the output position of the table in
 * *result_offset.
 *
 * `len` is the number of word bytes below this point: above 2 the entries
 * are child tables (written recursively), otherwise they are document-list
 * leaves, which are written and then freed.  The offsets written into the
 * table are relative: table position minus child position.
 *
 * The valid range of `t` is reset before returning.  Returns SQUAT_OK or
 * SQUAT_ERR.
 */
static int write_trie_word_data(SquatIndex* index, SquatWordTable* t, int len,
                                int* result_offset) {
  int offsets[256];
  int const lo = t->first_valid_entry;
  int const hi = t->last_valid_entry;
  SquatWordTableEntry* entries = t->entries;
  int ch;
  int here;
  int status;

  /* Slots without a child keep offset 0, which means "absent". */
  memset(offsets, 0, sizeof(offsets));

  for (ch = lo; ch <= hi; ch++) {
    if (len > 2) {
      SquatWordTable* child = entries[ch].table;

      if (child != NULL
          && write_trie_word_data(index, child, len - 1, offsets + ch)
             != SQUAT_OK) {
        return SQUAT_ERR;
      }
    } else {
      SquatWordTableLeafDocs* leaf = entries[ch].leaf_docs;

      if (leaf != NULL) {
        offsets[ch] = index->out.total_output_bytes;
        if (dump_doc_list_present_bits(index, leaf) != SQUAT_OK
            || dump_doc_list_docs(index, leaf) != SQUAT_OK) {
          return SQUAT_ERR;
        }
        free(leaf);
        entries[ch].leaf_docs = NULL;
      }
    }
  }

  here = index->out.total_output_bytes;
  *result_offset = here;

  /* Convert the absolute child positions into distances back from here. */
  for (ch = lo; ch <= hi; ch++) {
    if (offsets[ch] != 0) {
      offsets[ch] = here - offsets[ch];
    }
  }

  status = dump_word_table_offsets(index, t, offsets);

  t->first_valid_entry = 256;
  t->last_valid_entry = 0;

  return status;
}
/*
 * Build and write the trie for all words starting with byte `first_char`.
 *
 * The temp file for that byte is mapped read-only and its records
 * (document ID, word count, words) are replayed into
 * index->doc_word_table with one WordDocEntry per word occurrence.  The
 * resulting trie is written with write_trie_word_data(), which stores the
 * position of its root table in *result_offset.
 *
 * Returns SQUAT_OK or SQUAT_ERR; SQUAT_ERR_SYSERR is recorded when
 * mmap() or munmap() fails.
 */
static int dump_index_trie_words(SquatIndex* index, int first_char,
                                 int* result_offset) {
  SquatWriteBuffer* tmp = &index->index_buffers[first_char];
  int remaining = index->total_num_words[first_char];
  int status = SQUAT_OK;
  WordDocEntry* entries;
  char const* map_base;
  char const* cursor;

  /* One list node per word occurrence, handed out by add_word_to_trie. */
  entries = (WordDocEntry*)xmalloc(sizeof(WordDocEntry)*remaining);
  index->word_doc_allocator = entries;

  map_base = mmap(NULL, tmp->total_output_bytes, PROT_READ, MAP_SHARED,
                  tmp->fd, 0);
  if (map_base == MAP_FAILED) {
    squat_set_last_error(SQUAT_ERR_SYSERR);
    free(entries);
    return SQUAT_ERR;
  }

  cursor = map_base;
  while (remaining > 0) {
    int doc_ID = (int)squat_decode_I(&cursor);
    int in_doc = (int)squat_decode_I(&cursor);

    remaining -= in_doc;
    for (; in_doc > 0; in_doc--) {
      if (add_word_to_trie(index, cursor, doc_ID) != SQUAT_OK) {
        status = SQUAT_ERR;
        goto unmap;
      }
      cursor += SQUAT_WORD_SIZE - 1;
    }
  }

  /* Every node was used and every byte of the temp file was consumed. */
  assert(index->word_doc_allocator - entries
         == index->total_num_words[first_char]);
  assert(cursor - map_base == tmp->total_output_bytes);

  status = write_trie_word_data(index, index->doc_word_table,
                                SQUAT_WORD_SIZE - 1, result_offset);

unmap:
  /* An unmap failure only matters if nothing failed earlier. */
  if (munmap((void*)map_base, tmp->total_output_bytes) != 0
      && status == SQUAT_OK) {
    squat_set_last_error(SQUAT_ERR_SYSERR);
    status = SQUAT_ERR;
  }
  free(entries);
  return status;
}
/*
 * Common implementation of squat_index_finish() and squat_index_destroy().
 *
 * When `OK` is non-zero the index file is completed: a still-open
 * document is closed, the document offset list is written, the trie for
 * every first byte is built from its temp file and written, the root
 * table is written, and finally the file header is written at offset 0.
 * When `OK` is zero nothing is written.
 *
 * In both cases all resources owned by `index`, including `index` itself,
 * are released; the output descriptor is not closed here.
 *
 * Returns SQUAT_OK, or SQUAT_ERR if any step failed.
 */
static int index_close_internal(SquatIndex* index, int OK) {
  int r = SQUAT_OK;
  int doc_list_offset;
  int doc_ID_list_offset;
  int word_list_offset;
  char* buf;
  int i;
  SquatDiskHeader* header;
  int offset_buf[256];
  /* Depth to give delete_doc_word_table() at cleanup.  While documents
     are being scanned the table holds words of SQUAT_WORD_SIZE bytes;
     once it has been rebuilt for the dump phase it holds words of
     SQUAT_WORD_SIZE - 1 bytes.  Using the wrong depth would interpret
     child tables as leaves and leak what hangs below them. */
  int table_depth = SQUAT_WORD_SIZE;

  squat_set_last_error(SQUAT_ERR_OK);

  if (!OK) {
    goto cleanup;
  }

  /* Close the last document if the caller left it open. */
  if (index->current_doc_len >= 0) {
    if (squat_index_close_document(index) != SQUAT_OK) {
      r = SQUAT_ERR;
      goto cleanup;
    }
  }

  /* Replace the per-document table by a fresh one for the dump phase. */
  delete_doc_word_table(index->doc_word_table, SQUAT_WORD_SIZE);
  init_doc_word_table(&index->doc_word_table);
  table_depth = SQUAT_WORD_SIZE - 1;

  /* Document names start right after the header.  The offset list
     starts one byte after the current end of output because a single
     zero byte is written in front of it. */
  doc_list_offset = sizeof(SquatDiskHeader);
  doc_ID_list_offset = index->out.total_output_bytes + 1;

  if ((buf = prepare_buffered_write(&index->out,
         SQUAT_SAFETY_ZONE + ((index->current_doc_ID + 1)*4))) == NULL) {
    r = SQUAT_ERR;
    goto cleanup;
  }
  *buf++ = 0;
  memcpy(buf, index->doc_ID_list, index->current_doc_ID*4);
  buf += index->current_doc_ID*4;
  /* Four zero bytes follow the last offset. */
  memset(buf, 0, 4);
  complete_buffered_write(&index->out, buf + 4);

  memset(offset_buf, 0, sizeof(offset_buf));

  for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
    if (index->stats_callback != NULL) {
      SquatStatsEvent event;

      event.generic.type = SQUAT_STATS_COMPLETED_INITIAL_CHAR;
      event.completed_initial_char.completed_char = i;
      event.completed_initial_char.num_words = index->total_num_words[i];
      if (index->index_buffers[i].buf != NULL) {
        event.completed_initial_char.temp_file_size =
          index->index_buffers[i].total_output_bytes;
      } else {
        event.completed_initial_char.temp_file_size = 0;
      }
      index->stats_callback(index->stats_callback_closure, &event);
    }

    if (index->index_buffers[i].buf != NULL) {
      /* Flush the temp file, then turn its contents into a trie. */
      if (flush_and_reset_buffered_writes(index->index_buffers + i) != SQUAT_OK
          || dump_index_trie_words(index, i, offset_buf + i) != SQUAT_OK) {
        r = SQUAT_ERR;
        goto cleanup;
      }

      /* A failed close is reported but does not stop the dump. */
      if (close(index->index_buffers[i].fd) < 0) {
        squat_set_last_error(SQUAT_ERR_SYSERR);
        r = SQUAT_ERR;
      }
      free(index->index_buffers[i].buf);
      /* NULL keeps the cleanup code from releasing this buffer again. */
      index->index_buffers[i].buf = NULL;
    } else {
      offset_buf[i] = 0;
    }
  }

  /* Write the root table: offsets become distances back from here, and
     the root's valid range is set to cover every first byte present. */
  word_list_offset = index->out.total_output_bytes;

  for (i = 0; i < VECTOR_SIZE(offset_buf); i++) {
    if (offset_buf[i] != 0) {
      offset_buf[i] = word_list_offset - offset_buf[i];

      if (i < index->doc_word_table->first_valid_entry) {
        index->doc_word_table->first_valid_entry = i;
      }
      index->doc_word_table->last_valid_entry = i;
    }
  }

  if (dump_word_table_offsets(index, index->doc_word_table, offset_buf)
      != SQUAT_OK) {
    r = SQUAT_ERR;
    goto cleanup;
  }

  /* Zero padding after the last table. */
  if ((buf = prepare_buffered_write(&index->out, SQUAT_SAFETY_ZONE)) == NULL) {
    r = SQUAT_ERR;
    goto cleanup;
  }
  memset(buf, 0, SQUAT_SAFETY_ZONE);
  complete_buffered_write(&index->out, buf + SQUAT_SAFETY_ZONE);

  /* Flushing also seeks back to offset 0, where the header goes. */
  if (flush_and_reset_buffered_writes(&index->out) != SQUAT_OK) {
    r = SQUAT_ERR;
    goto cleanup;
  }

  if ((header = (SquatDiskHeader*)prepare_buffered_write(&index->out,
         sizeof(SquatDiskHeader))) == NULL) {
    r = SQUAT_ERR;
    goto cleanup;
  }
  memcpy(header->header_text, squat_index_file_header, 8);
  squat_encode_64(header->doc_list_offset, doc_list_offset);
  squat_encode_64(header->doc_ID_list_offset, doc_ID_list_offset);
  squat_encode_64(header->word_list_offset, word_list_offset);
  memcpy(header->valid_char_bits, index->valid_char_bits,
         sizeof(header->valid_char_bits));
  complete_buffered_write(&index->out, (char*)(header + 1));

  if (flush_and_reset_buffered_writes(&index->out) != SQUAT_OK) {
    r = SQUAT_ERR;
    goto cleanup;
  }

cleanup:
  free(index->out.buf);
  delete_doc_word_table(index->doc_word_table, table_depth);
  /* Release the temp files that were not consumed above. */
  for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
    if (index->index_buffers[i].buf != NULL) {
      close(index->index_buffers[i].fd);
      free(index->index_buffers[i].buf);
    }
  }
  free(index->tmp_path);
  free(index->doc_ID_list);
  free(index);
  return r;
}
/*
 * Complete the index file and release `index`.  Returns the result of
 * index_close_internal().
 */
int squat_index_finish(SquatIndex* index) {
  int const write_index = 1;

  return index_close_internal(index, write_index);
}
/*
 * Release `index` without completing the index file.  Returns the result
 * of index_close_internal().
 */
int squat_index_destroy(SquatIndex* index) {
  int const write_index = 0;

  return index_close_internal(index, write_index);
}