ucbuf.h   [plain text]


/*
*******************************************************************************
*
*   Copyright (C) 1998-2008, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*
* File ucbuf.c
*
* Modification History:
*
*   Date        Name        Description
*   05/10/01    Ram         Creation.
*
* This API reads in files and returns UChars
*******************************************************************************
*/

#include "unicode/ucnv.h"
#include "filestrm.h"

#if !UCONFIG_NO_CONVERSION

#ifndef UCBUF_H
#define UCBUF_H 1

typedef struct UCHARBUF UCHARBUF;
/**
 * End of file value
 */
#define U_EOF 0xFFFFFFFF
/**
 * Error value if a sequence cannot be unescaped
 */
#define U_ERR 0xFFFFFFFE

typedef struct ULine ULine;

struct  ULine {
    UChar     *name;
    int32_t   len;
};

/**
 * Opens the UCHARBUF with the given file stream and code page for conversion
 * @param fileName  Name of the file to open.
 * @param codepage  The encoding of the file stream to convert to Unicode. 
 *                  If *codepoge is NULL on input the API will try to autodetect
 *                  popular Unicode encodings
 * @param showWarning Flag to print out warnings to STDOUT
 * @param buffered  If TRUE performs a buffered read of the input file. If FALSE reads 
 *                  the whole file into memory and converts it.
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 * @return pointer to the newly opened UCHARBUF
 */
U_CAPI UCHARBUF* U_EXPORT2
ucbuf_open(const char* fileName,const char** codepage,UBool showWarning, UBool buffered, UErrorCode* err);

/**
 * Gets a UTF-16 code unit at the current position from the converted buffer
 * and increments the current position
 * @param buf Pointer to UCHARBUF structure
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 */
U_CAPI int32_t U_EXPORT2
ucbuf_getc(UCHARBUF* buf,UErrorCode* err);

/**
 * Gets a UTF-32 code point at the current position from the converted buffer
 * and increments the current position
 * @param buf Pointer to UCHARBUF structure
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 */
U_CAPI int32_t U_EXPORT2
ucbuf_getc32(UCHARBUF* buf,UErrorCode* err);

/**
 * Gets a UTF-16 code unit at the current position from the converted buffer after 
 * unescaping and increments the current position. If the escape sequence is for UTF-32
 * code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned
 * @param buf Pointer to UCHARBUF structure
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 */
U_CAPI int32_t U_EXPORT2
ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err);

/**
 * Gets a pointer to the current position in the internal buffer and length of the line.
 * It imperative to make a copy of the returned buffere before performing operations on it.
 * @param buf Pointer to UCHARBUF structure
 * @param len Output param to receive the len of the buffer returned till end of the line
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 *        Error: U_TRUNCATED_CHAR_FOUND
 * @return Pointer to the internal buffer, NULL if EOF
 */
U_CAPI const UChar* U_EXPORT2
ucbuf_readline(UCHARBUF* buf,int32_t* len, UErrorCode* err);


/**
 * Resets the buffers and the underlying file stream.
 * @param buf Pointer to UCHARBUF structure
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 */
U_CAPI void U_EXPORT2
ucbuf_rewind(UCHARBUF* buf,UErrorCode* err);

/**
 * Returns a pointer to the internal converted buffer
 * @param buf Pointer to UCHARBUF structure
 * @param len Pointer to int32_t to receive the lenth of buffer
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 * @return Pointer to internal UChar buffer
 */
U_CAPI const UChar* U_EXPORT2
ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* err);

/**
 * Closes the UCHARBUF structure members and cleans up the malloc'ed memory
 * @param buf Pointer to UCHARBUF structure
 */
U_CAPI void U_EXPORT2
ucbuf_close(UCHARBUF* buf);

/**
 * Rewinds the buffer by one codepoint. Does not rewind over escaped characters.
 */
U_CAPI void U_EXPORT2
ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf);


/**
 * Autodetects the encoding of the file stream. Only Unicode charsets are autodectected. 
 * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
 * the converter to correct state for converting the rest of the stream. So the UConverter parameter
 * is necessary.
 * If the charset was autodetected, the caller must close both the input FileStream
 * and the converter.
 *
 * @param fileName The file name to be opened and encoding autodected
 * @param conv  Output param to receive the opened converter if autodetected; NULL otherwise.
 * @param cp Output param to receive the detected encoding
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 * @return The input FileStream if its charset was autodetected; NULL otherwise.
 */
U_CAPI FileStream * U_EXPORT2
ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv,
int32_t* signatureLength, UErrorCode* status);

/**
 * Autodetects the encoding of the file stream. Only Unicode charsets are autodectected. 
 * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
 * the converter to correct state for converting the rest of the stream. So the UConverter parameter
 * is necessary.
 * If the charset was autodetected, the caller must close the converter.
 *
 * @param fileStream The file stream whose encoding is to be detected
 * @param conv  Output param to receive the opened converter if autodetected; NULL otherwise.
 * @param cp Output param to receive the detected encoding
 * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
 *        indicates a failure on entry, the function will immediately return.
 *        On exit the value will indicate the success of the operation.
 * @return Boolean whether the Unicode charset was autodetected.
 */

U_CAPI UBool U_EXPORT2
ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* status);

/**
 * Returns the approximate size in UChars required for converting the file to UChars
 */
U_CAPI int32_t U_EXPORT2
ucbuf_size(UCHARBUF* buf);

U_CAPI const char* U_EXPORT2
ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status);

#endif
#endif