ugrep.cpp   [plain text]


/**************************************************************************
*
*   Copyright (C) 2002-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
***************************************************************************
*/

//
//   ugrep  - an ICU sample program illustrating the use of ICU Regular Expressions.
//
//            The use of the ICU Regex API all occurs within the main()
//            function.  The rest of the code deals with with opening files,
//            encoding conversions, printing results, etc.
//
//            This is not a full-featured grep program.  The command line options
//            have been kept to a minimum to avoid complicating the sample code.
//



#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/regex.h"
#include "unicode/ucnv.h"
#include "unicode/uclean.h"


//
//  The following variables contain paramters that may be set from the command line.
//
const char *pattern = NULL;     // The regular expression
int        firstFileNum;        //  argv index of the first file name
UBool      displayFileName = FALSE;
UBool      displayLineNum  = FALSE;


//
//  Info regarding the file currently being processed
//
const char *fileName;      
int         fileLen;              // Length, in UTF-16 Code Units.  

UChar      *ucharBuf = 0;         // Buffer, holds converted file.  (Simple minded program, always reads
                                  //   the whole file at once.

char       *charBuf = 0;          // Buffer, for original, unconverted file data.


//
//  Info regarding the line currently being processed
//
int      lineStart;     // Index of first char of the current line in the file buffer
int      lineEnd;       // Index of char following the new line sequence for the current line
int      lineNum;

//
//  Converter, used on output to convert Unicode data back to char *
//             so that it will display in non-Unicode terminal windows.
//
UConverter  *outConverter = 0;

//
//  Function forward declarations
//
void processOptions(int argc, const char **argv);
void nextLine(int start);
void printMatch();
void printUsage();
void readFile(const char *name);



//------------------------------------------------------------------------------------------
//
//   main          for ugrep
//
//           Structurally, all use of the ICU Regular Expression API is in main(),
//           and all of the supporting stuff necessary to make a running program, but
//           not directly related to regular expressions, is factored out into these other
//           functions.
//
//------------------------------------------------------------------------------------------
int main(int argc, const char** argv) {
    UBool     matchFound = FALSE;

    //
    //  Process the commmand line options.
    //
    processOptions(argc, argv);

    //
    // Create a RegexPattern object from the user supplied pattern string.
    //
    UErrorCode status = U_ZERO_ERROR;   // All ICU operations report success or failure
                                        //   in a status variable.

    UParseError    parseErr;            // In the event of a syntax error in the regex pattern,
                                        //   this struct will contain the position of the
                                        //   error.

    RegexPattern  *rePat = RegexPattern::compile(pattern, parseErr, status);
                                        // Note that C++ is doing an automatic conversion
                                        //  of the (char *) pattern to a temporary
                                        //  UnicodeString object.
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep:  error in pattern: \"%s\" at position %d\n",
            u_errorName(status), parseErr.offset);
        exit(-1);
    }

    //
    // Create a RegexMatcher from the newly created pattern.
    //
    UnicodeString empty;
    RegexMatcher *matcher = rePat->matcher(empty, status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep:  error in creating RegexMatcher: \"%s\"\n",
            u_errorName(status));
        exit(-1);
    }

    //
    // Loop, processing each of the input files.
    //
    for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
        readFile(argv[fileNum]);

        //
        //  Loop through the lines of a file, trying to match the regex pattern on each.
        //
        for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
            UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
            matcher->reset(s);
            if (matcher->find()) {
                matchFound = TRUE;
                printMatch();
            }
        }
    }

    //
    //  Clean up
    //
    delete matcher;
    delete rePat;
    free(ucharBuf);
    free(charBuf);
    ucnv_close(outConverter);
    
    u_cleanup();       // shut down ICU, release any cached data it owns.

    return matchFound? 0: 1;
}



//------------------------------------------------------------------------------------------
//
//   doOptions          Run through the command line options, and set
//                      the global variables accordingly.
//
//                      exit without returning if an error occured and
//                      ugrep should not proceed further.
//
//------------------------------------------------------------------------------------------
void processOptions(int argc, const char **argv) {
    int            optInd;
    UBool          doUsage   = FALSE;
    UBool          doVersion = FALSE;
    const char    *arg;


    for(optInd = 1; optInd < argc; ++optInd) {
        arg = argv[optInd];
        
        /* version info */
        if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
            doVersion = TRUE;
        }
        /* usage info */
        else if(strcmp(arg, "--help") == 0) {
            doUsage = TRUE;
        }
        else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
            displayLineNum = TRUE;
        }
        /* POSIX.1 says all arguments after -- are not options */
        else if(strcmp(arg, "--") == 0) {
            /* skip the -- */
            ++optInd;
            break;
        }
        /* unrecognized option */
        else if(strncmp(arg, "-", strlen("-")) == 0) {
            printf("ugrep: invalid option -- %s\n", arg+1);
            doUsage = TRUE;
        }
        /* done with options */
        else {
            break;
        }
    }

    if (doUsage) {
        printUsage();
        exit(0);
    }

    if (doVersion) {
        printf("ugrep version 0.01\n");
        if (optInd == argc) {
            exit(0);
        }
    }

    int  remainingArgs = argc-optInd;     // pattern file ...
    if (remainingArgs < 2) {
        fprintf(stderr, "ugrep:  files or pattern are missing.\n");
        printUsage();
        exit(1);
    }

    if (remainingArgs > 2) {
        // More than one file to be processed.   Display file names with match output.
        displayFileName = TRUE;
    }

    pattern      = argv[optInd];
    firstFileNum = optInd+1;
}

//------------------------------------------------------------------------------------------
//
//   printUsage
//
//------------------------------------------------------------------------------------------
void printUsage() {
    printf("ugrep [options] pattern file...\n"
        "     -V or --version     display version information\n"
        "     --help              display this help and exit\n"
        "     --                  stop further option processing\n"
        "-n,  --line-number       Prefix each line of output with the line number within its input file.\n"
        );
    exit(0);
}

//------------------------------------------------------------------------------------------
//
//    readFile          Read a file into memory, and convert it to Unicode.
//
//                      Since this is just a demo program, take the simple minded approach
//                      of always reading the whole file at once.  No intelligent buffering
//                      is done.
//
//------------------------------------------------------------------------------------------
void readFile(const char *name) {

    //
    //  Initialize global file variables
    //
    fileName = name;
    fileLen  = 0;      // zero length prevents processing in case of errors.


    //
    //  Open the file and determine its size.
    //
    FILE *file = fopen(name, "rb");
    if (file == 0 ) {
        fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
        return;
    }
    fseek(file, 0, SEEK_END);
    int rawFileLen = ftell(file);
    fseek(file, 0, SEEK_SET);
    

    //
    //   Read in the file
    //
    charBuf    = (char *)realloc(charBuf, rawFileLen+1);   // Need error checking...
    int t = fread(charBuf, 1, rawFileLen, file);
    if (t != rawFileLen)  {
        fprintf(stderr, "Error reading file \"%s\"\n", fileName);
        fclose(file);
        return;
    }
    charBuf[rawFileLen]=0;
    fclose(file);

    //
    // Look for a Unicode Signature (BOM) in the data
    //
    int32_t        signatureLength;
    const char *   charDataStart = charBuf;
    UErrorCode     status        = U_ZERO_ERROR;
    const char*    encoding      = ucnv_detectUnicodeSignature(
                           charDataStart, rawFileLen, &signatureLength, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
            u_errorName(status));
        return;
    }
    if(encoding!=NULL ){
        charDataStart  += signatureLength;
        rawFileLen     -= signatureLength;
    }

    //
    // Open a converter to take the file to UTF-16
    //
    UConverter* conv;
    conv = ucnv_open(encoding, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
        return;
    }

    //
    // Convert the file data to UChar.
    //  Preflight first to determine required buffer size.
    //
    uint32_t destCap = ucnv_toUChars(conv,
                       NULL,           //  dest,
                       0,              //  destCapacity,
                       charDataStart,
                       rawFileLen,
                       &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        return;
    };
    
    status = U_ZERO_ERROR;
    ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
    ucnv_toUChars(conv,
        ucharBuf,           //  dest,
        destCap+1,
        charDataStart,
        rawFileLen,
        &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        return;
    };
    ucnv_close(conv);
    
    //
    //  Successful conversion.  Set the global size variables so that
    //     the rest of the processing will proceed for this file.
    //
    fileLen = destCap;
}
    
    
    


//------------------------------------------------------------------------------------------
//
//   nextLine           Advance the line index variables, starting at the
//                      specified position in the input file buffer, by
//                      scanning forwrd until the next end-of-line.
//
//                      Need to take into account all of the possible Unicode
//                      line ending sequences.
//
//------------------------------------------------------------------------------------------
void nextLine(int  startPos) {
    if (startPos == 0) {
        lineNum = 0;
    } else {
        lineNum++;
    }
    lineStart = lineEnd = startPos;

    for (;;) {
        if (lineEnd >= fileLen) {
            return;
        }
        UChar c = ucharBuf[lineEnd];
        lineEnd++;
        if (c == 0x0a   ||       // Line Feed
            c == 0x0c   ||       // Form Feed
            c == 0x0d   ||       // Carriage Return
            c == 0x85   ||       // Next Line
            c == 0x2028 ||       // Line Separator
            c == 0x2029)         // Paragraph separator
        { 
            break;
        }
    }

    // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
    if (lineEnd < fileLen           &&
        ucharBuf[lineEnd-1] == 0x0d &&
        ucharBuf[lineEnd]   == 0x0a) 
    {
        lineEnd++;
    }
}


//------------------------------------------------------------------------------------------
//
//   printMatch         Called when a matching line has been located.
//                      Print out the line from the file with the match, after
//                         converting it back to the default code page.
//
//------------------------------------------------------------------------------------------
void printMatch() {
    char                buf[2000];
    UErrorCode         status       = U_ZERO_ERROR;

    // If we haven't already created a converter for output, do it now.
    if (outConverter == 0) {
        outConverter = ucnv_open(NULL, &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ugrep:  Error opening default converter: \"%s\"\n",
                u_errorName(status));
            exit(-1);
        }
    };

    // Convert the line to be printed back to the default 8 bit code page.
    //   If the line is too long for our buffer, just truncate it.
    ucnv_fromUChars(outConverter,
                    buf,                   // destination buffer for conversion
                    sizeof(buf),           // capacity of destination buffer
                    &ucharBuf[lineStart],   // Input to conversion
                    lineEnd-lineStart,     // number of UChars to convert
                    &status);
    buf[sizeof(buf)-1] = 0;                // Add null for use in case of too long lines.
                                           // The converter null-terminates its output unless
                                           //   the buffer completely fills.
   
    if (displayFileName) {
        printf("%s:", fileName);
    }
    if (displayLineNum) {
        printf("%d:", lineNum);
    }
    printf("%s", buf);
}