#include <config.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "squat_internal.h"
#include "xmalloc.h"
struct _SquatSearchIndex {
int index_fd;
char const* data;
char const* doc_list;
char const* word_list;
char const* doc_ID_list;
char const* data_end;
unsigned char valid_char_bits[32];
};
static char bit_counts[256];
static int memconst(char const* s, int len, char v) {
while (len > 0 && *s == v) {
s++;
len--;
}
return len == 0;
}
SquatSearchIndex* squat_search_open(int fd) {
struct stat buf;
SquatSearchIndex* index;
SquatDiskHeader const* header;
SquatInt64 doc_list_offset, doc_ID_list_offset, word_list_offset;
SquatInt64 data_len;
squat_set_last_error(SQUAT_ERR_OK);
if (bit_counts[1] == 0) {
int c;
for (c = 1; c < 256; c++) {
bit_counts[c] = bit_counts[c >> 1] + (c & 1);
}
}
index = (SquatSearchIndex*)xmalloc(sizeof(SquatSearchIndex));
index->index_fd = fd;
if (fstat(fd, &buf) != 0) {
squat_set_last_error(SQUAT_ERR_SYSERR);
goto cleanup_index;
}
data_len = buf.st_size - SQUAT_SAFETY_ZONE;
if (data_len < sizeof(SquatDiskHeader)) {
squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
goto cleanup_index;
}
index->data = mmap(NULL, data_len + SQUAT_SAFETY_ZONE, PROT_READ, MAP_SHARED, fd, 0);
if (index->data == MAP_FAILED) {
squat_set_last_error(SQUAT_ERR_SYSERR);
goto cleanup_index;
}
header = (SquatDiskHeader const*)index->data;
doc_list_offset = squat_decode_64(header->doc_list_offset);
word_list_offset = squat_decode_64(header->word_list_offset);
doc_ID_list_offset = squat_decode_64(header->doc_ID_list_offset);
if (memcmp(header->header_text, squat_index_file_header, 8) != 0
|| doc_list_offset < 0 || doc_list_offset >= data_len
|| word_list_offset < 0 || word_list_offset >= data_len
|| doc_ID_list_offset < 0 || doc_ID_list_offset >= data_len
|| !memconst(index->data + data_len, SQUAT_SAFETY_ZONE, 0)) {
squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
goto cleanup_unmap;
}
index->doc_list = index->data + doc_list_offset;
index->word_list = index->data + word_list_offset;
index->doc_ID_list = index->data + doc_ID_list_offset;
index->data_end = index->data + data_len;
memcpy(index->valid_char_bits, header->valid_char_bits,
sizeof(index->valid_char_bits));
return index;
cleanup_unmap:
munmap((void*)index->data, data_len + SQUAT_SAFETY_ZONE);
cleanup_index:
free(index);
return NULL;
}
int squat_search_list_docs(SquatSearchIndex* index,
SquatListDocCallback handler, void* closure) {
char const* s = index->doc_list;
squat_set_last_error(SQUAT_ERR_OK);
while (*s != 0) {
SquatListDoc list_doc;
int r;
list_doc.doc_name = s;
s += strlen(s) + 1;
list_doc.size = squat_decode_I(&s);
r = handler(closure, &list_doc);
if (r == SQUAT_CALLBACK_ABORT) {
break;
}
assert(r == SQUAT_CALLBACK_CONTINUE);
}
return SQUAT_OK;
}
static char const* lookup_word_docs(SquatSearchIndex* index,
char const* data, int* invalid_file) {
int i;
char const* s = index->word_list;
for (i = 0; i < SQUAT_WORD_SIZE; i++) {
char p;
char ch = data[i];
char const* branch_start = s;
int skip;
p = *s++;
if ((p & 0xE0) != 0) {
if (ch != p) {
return NULL;
}
skip = 0;
} else {
char count;
char const* base;
int offset, j;
if ((unsigned char)ch < 8*p) {
return NULL;
}
count = (*s++) + 1;
if ((unsigned char)ch >= 8*(p + count)) {
return NULL;
}
offset = (unsigned char)ch/8 - p;
if ((s[offset] & (1 << (ch & 7))) == 0) {
return NULL;
}
base = s;
s += count;
skip = 0;
for (j = 0; j < offset; j++) {
skip += bit_counts[(unsigned char)base[j]];
}
for (j = 0; j < (ch & 7); j++) {
if ((base[offset] & (1 << j)) != 0) {
skip++;
}
}
}
if (i < SQUAT_WORD_SIZE - 1) {
int next_offset;
s = squat_decode_skip_I(s, skip);
next_offset = squat_decode_I(&s);
s = branch_start - next_offset;
if (next_offset < 0 || s >= index->data_end) {
*invalid_file = 1;
return NULL;
}
} else {
while (skip-- > 0) {
char const* t = s;
int v = (int)squat_decode_I(&t);
if ((v & 1) != 0) {
s = t;
} else {
s = t + (v >> 1);
}
}
}
}
return s;
}
static int count_docs_containing_word(SquatSearchIndex* index,
char const* data, char const** run_start) {
int invalid_file = 0;
char const* raw_doc_list = lookup_word_docs(index, data, &invalid_file);
int i;
if (raw_doc_list == NULL) {
return invalid_file ? -1 : 0;
}
*run_start = raw_doc_list;
i = (int)squat_decode_I(&raw_doc_list);
if ((i & 1) != 0) {
return 1;
} else {
int size = i >> 1;
char const* s = raw_doc_list;
int count = 0;
if (raw_doc_list + size >= index->data_end) {
return -1;
}
while (s - raw_doc_list < size) {
i = (int)squat_decode_I(&s);
if ((i & 1) == 1) {
count++;
} else {
count += i >> 1;
s = squat_decode_skip_I(s, 1);
}
}
if (raw_doc_list + size != s) {
return -1;
}
return count;
}
}
typedef struct {
int array_len;
int* array_data;
int index;
} SquatDocSet;
static int
set_to_docs_containing_word(SquatSearchIndex* index __attribute__((unused)),
SquatDocSet* set,
char const* data __attribute__((unused)),
int doc_count, char const* doc_list)
{
int i;
set->array_len = doc_count;
set->array_data = (int*)xmalloc(sizeof(int)*set->array_len);
i = (int)squat_decode_I(&doc_list);
if ((i & 1) != 0) {
set->array_data[0] = i >> 1;
} else {
int size = i >> 1;
char const* s = doc_list;
int last_doc = 0;
int j = 0;
while (s - doc_list < size) {
i = (int)squat_decode_I(&s);
if ((i & 1) == 1) {
last_doc = set->array_data[j++] = last_doc + (i >> 1);
} else {
int count = i >> 1;
int delta = squat_decode_I(&s);
last_doc += delta;
set->array_data[j++] = last_doc;
while (--count > 0) {
last_doc++;
set->array_data[j++] = last_doc;
}
}
}
}
return SQUAT_OK;
}
static void filter_doc(SquatDocSet* set, int doc) {
int i = set->index;
while (i < set->array_len && set->array_data[i] < doc) {
set->array_data[i] = -1;
i++;
}
if (i < set->array_len && set->array_data[i] == doc) {
i++;
}
set->index = i;
}
static void
filter_to_docs_containing_word(SquatSearchIndex* index __attribute__((unused)),
SquatDocSet* set,
char const* data __attribute__((unused)),
char const* doc_list)
{
int i = (int)squat_decode_I(&doc_list);
set->index = 0;
if ((i & 1) != 0) {
filter_doc(set, i >> 1);
} else {
int size = i >> 1;
char const* s = doc_list;
int last_doc = 0;
while (s - doc_list < size) {
i = (int)squat_decode_I(&s);
if ((i & 1) == 1) {
filter_doc(set, last_doc += i >> 1);
} else {
int count = i >> 1;
int delta = squat_decode_I(&s);
last_doc += delta;
filter_doc(set, last_doc);
while (--count > 0) {
last_doc++;
filter_doc(set, last_doc);
}
}
}
}
}
static void select_first_doc(SquatDocSet* set) {
set->index = 0;
while (set->index < set->array_len && set->array_data[set->index] < 0) {
set->index++;
}
}
static int has_more_docs(SquatDocSet* set) {
return set->index < set->array_len;
}
static int get_next_doc(SquatDocSet* set) {
int doc = set->array_data[set->index];
set->index++;
while (set->index < set->array_len && set->array_data[set->index] < 0) {
set->index++;
}
return doc;
}
static void destroy_docset(SquatDocSet* set) {
free(set->array_data);
}
int squat_search_execute(SquatSearchIndex* index, char const* data,
int data_len, SquatSearchResultCallback handler, void* closure) {
int i;
int min_doc_count_word;
int min_doc_count;
SquatDocSet set;
char const** run_starts;
if (data_len < SQUAT_WORD_SIZE) {
squat_set_last_error(SQUAT_ERR_SEARCH_STRING_TOO_SHORT);
return SQUAT_ERR;
}
for (i = 0; i < data_len; i++) {
int ch = (unsigned char)data[i];
if ((index->valid_char_bits[ch >> 3] & (1 << (ch & 7))) == 0) {
squat_set_last_error(SQUAT_ERR_SEARCH_STRING_INVALID_CHAR);
return SQUAT_ERR;
}
}
run_starts = (char const**)xmalloc(sizeof(char const*)*
(data_len - SQUAT_WORD_SIZE + 1));
squat_set_last_error(SQUAT_ERR_OK);
min_doc_count = count_docs_containing_word(index, data, run_starts);
if (min_doc_count < 0) {
squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
goto cleanup_run_starts;
} else if (min_doc_count == 0) {
goto cleanup_run_starts_ok;
}
min_doc_count_word = 0;
for (i = 1; i <= data_len - SQUAT_WORD_SIZE; i++) {
int doc_count = count_docs_containing_word(index, data + i,
run_starts + i);
if (doc_count < 0) {
squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
goto cleanup_run_starts;
} else if (doc_count == 0) {
goto cleanup_run_starts_ok;
} else if (doc_count < min_doc_count) {
min_doc_count = doc_count;
min_doc_count_word = i;
}
}
if (set_to_docs_containing_word(index, &set, data + min_doc_count_word,
min_doc_count, run_starts[min_doc_count_word]) == SQUAT_ERR) {
goto cleanup_run_starts;
}
for (i = 0; i <= data_len - SQUAT_WORD_SIZE; i++) {
if (i != min_doc_count_word) {
filter_to_docs_containing_word(index, &set, data + i, run_starts[i]);
}
}
select_first_doc(&set);
while (has_more_docs(&set)) {
int next_doc;
char const* next_doc_info;
char const* next_doc_data;
int r;
next_doc = get_next_doc(&set);
next_doc_info = index->doc_ID_list + next_doc*4;
if (next_doc < 0 && next_doc_info >= index->data_end) {
squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
goto cleanup_docset;
}
next_doc_data = index->doc_list + squat_decode_32(next_doc_info);
if (next_doc_data < index->doc_list || next_doc_data >= index->data_end) {
squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
goto cleanup_docset;
}
r = handler(closure, next_doc_data);
if (r == SQUAT_CALLBACK_ABORT) {
break;
}
assert(r == SQUAT_CALLBACK_CONTINUE);
}
destroy_docset(&set);
cleanup_run_starts_ok:
free(run_starts);
return SQUAT_OK;
cleanup_docset:
destroy_docset(&set);
cleanup_run_starts:
free(run_starts);
return SQUAT_ERR;
}
int squat_search_close(SquatSearchIndex* index) {
int r = SQUAT_OK;
squat_set_last_error(SQUAT_ERR_OK);
if (munmap((void*)index->data,
index->data_end + SQUAT_SAFETY_ZONE - index->data) != 0) {
squat_set_last_error(SQUAT_ERR_SYSERR);
r = SQUAT_ERR;
}
free(index);
return r;
}