mbfilter_htmlent.c   [plain text]


/*
 * "streamable kanji code filter and converter"
 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
 *
 * LICENSE NOTICES
 *
 * This file is part of "streamable kanji code filter and converter",
 * which is distributed under the terms of GNU Lesser General Public 
 * License (version 2) as published by the Free Software Foundation.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with "streamable kanji code filter and converter";
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
 * Suite 330, Boston, MA  02111-1307  USA
 *
 * The author of this part: Marcus Boerger <helly@php.net>
 *
 */
/*
 * The source code included in this files was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 * 
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_STRING_H
#include <string.h>
#endif

#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif

#include "mbfilter.h"
#include "mbfilter_htmlent.h"
#include "html_entities.h"

static const int htmlentitifieds[256] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};

static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL};

const mbfl_encoding mbfl_encoding_html_ent = {
	mbfl_no_encoding_html_ent,
	"HTML-ENTITIES",
	"HTML-ENTITIES",
	(const char *(*)[])&mbfl_encoding_html_ent_aliases,
	NULL,
	MBFL_ENCTYPE_ENC_STRM | MBFL_ENCTYPE_GL_UNSAFE
};

const struct mbfl_convert_vtbl vtbl_wchar_html = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_html_ent,
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_html_enc,
	mbfl_filt_conv_html_enc_flush
};

const struct mbfl_convert_vtbl vtbl_html_wchar = {
	mbfl_no_encoding_html_ent,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_html_dec_ctor,
	mbfl_filt_conv_html_dec_dtor,
	mbfl_filt_conv_html_dec,
	mbfl_filt_conv_html_dec_flush };


#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)

/*
 * any => HTML
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	int tmp[64];
	int i;
	unsigned int uc;
	const mbfl_html_entity_entry *e;

	if (c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) &&
				htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
	} else {
 		CK((*filter->output_function)('&', filter->data));
		for (i = 0; (e = &mbfl_html_entity_list[i])->name != NULL; i++) {
			if (c == e->code) {
				char *p;
				
				for (p = e->name; *p != '\0'; p++) {
					CK((*filter->output_function)((int)*p, filter->data));
				}
				goto last;
			}
		}

		{
			int *p = tmp + sizeof(tmp) / sizeof(tmp[0]);

			CK((*filter->output_function)('#', filter->data));

			uc = (unsigned int)c;

			*(--p) = '\0';
			do {
				*(--p) = "0123456789"[uc % 10];
				uc /= 10;
			} while (uc);

			for (; *p != '\0'; p++) {
				CK((*filter->output_function)(*p, filter->data));
			}
		}
	last:
		CK((*filter->output_function)(';', filter->data));
	}
	return c;
}

int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->status = 0;
	filter->opaque = NULL;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}

/*
 * HTML => any
 */
#define html_enc_buffer_size	16
static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	filter->status = 0;
	filter->opaque = mbfl_malloc(html_enc_buffer_size+1);
}
	
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	filter->status = 0;
	if (filter->opaque)
	{
		mbfl_free((void*)filter->opaque);
	}
	filter->opaque = NULL;
}

int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int  pos, ent = 0;
	mbfl_html_entity_entry *entity;
	char *buffer = (char*)filter->opaque;

	if (!filter->status) {
		if (c == '&' ) {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
	} else {
		if (c == ';') {
			if (buffer[1]=='#') {
				if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
					if (filter->status > 3) {
						/* numeric entity */
						for (pos=3; pos<filter->status; pos++) {
							int v =  buffer[pos];
							if (v >= '0' && v <= '9') {
								v = v - '0';
							} else if (v >= 'A' && v <= 'F') {
								v = v - 'A' + 10;
							} else if (v >= 'a' && v <= 'f') {
								v = v - 'a' + 10;
							} else {
								ent = -1;
								break;
							}
							ent = ent * 16 + v;
						}
					} else {
						ent = -1;
					}
				} else {
					/* numeric entity */
					if (filter->status > 2) {
						for (pos=2; pos<filter->status; pos++) {
							int v = buffer[pos];
							if (v >= '0' && v <= '9') {
								v = v - '0';
							} else {
								ent = -1;
								break;
							}
							ent = ent*10 + v;
						}
					} else {
						ent = -1;
					}
				}
				if (ent >= 0 && ent < 0x110000) {
					CK((*filter->output_function)(ent, filter->data));
				} else {
					for (pos = 0; pos < filter->status; pos++) {
						CK((*filter->output_function)(buffer[pos], filter->data));
					}
					CK((*filter->output_function)(c, filter->data));
				}
				filter->status = 0;
				/*php_error_docref("ref.mbstring" TSRMLS_CC, E_NOTICE, "mbstring decoded '%s'=%d", buffer, ent);*/
			} else {
				/* named entity */
				buffer[filter->status] = 0;
				entity = (mbfl_html_entity_entry *)mbfl_html_entity_list;
				while (entity->name) {
					if (!strcmp(buffer+1, entity->name))	{
						ent = entity->code;
						break;
					}
					entity++;
				}
				if (ent) {
					/* decoded */
					CK((*filter->output_function)(ent, filter->data));
					filter->status = 0;
					/*php_error_docref("ref.mbstring" TSRMLS_CC, E_NOTICE,"mbstring decoded '%s'=%d", buffer, ent);*/
				} else { 
					/* failure */
					buffer[filter->status++] = ';';
					buffer[filter->status] = 0;
					/* php_error_docref("ref.mbstring" TSRMLS_CC, E_WARNING, "mbstring cannot decode '%s'", buffer); */
					mbfl_filt_conv_html_dec_flush(filter);
				}
			}
		} else {
			/* add character */
			buffer[filter->status++] = c;
			/* add character and check */
			if (!strchr(html_entity_chars, c) || filter->status+1==html_enc_buffer_size || (c=='#' && filter->status>2))
			{
				/* illegal character or end of buffer */
				if (c=='&')
					filter->status--;
				buffer[filter->status] = 0;
				/* php_error_docref("ref.mbstring" TSRMLS_CC, E_WARNING, "mbstring cannot decode '%s'", buffer)l */
				mbfl_filt_conv_html_dec_flush(filter);
				if (c=='&')
				{
					buffer[filter->status++] = '&';
				}
			}
		}
	}
	return c;
}

int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	int status, pos = 0;
	unsigned char *buffer;
	int err = 0;

	buffer = (unsigned char*)filter->opaque;
	status = filter->status;
	filter->status = 0;

	/* flush fragments */
	while (status--) {
		int e = (*filter->output_function)(buffer[pos++], filter->data);
		if (e != 0)
			err = e;
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return err;
}