Skip to content

Instantly share code, notes, and snippets.

@flying19880517
Created July 7, 2012 16:27
Show Gist options
  • Select an option

  • Save flying19880517/3067078 to your computer and use it in GitHub Desktop.

Select an option

Save flying19880517/3067078 to your computer and use it in GitHub Desktop.

Revisions

  1. Lionheart created this gist Jul 7, 2012.
    58 changes: 58 additions & 0 deletions isUtf8File.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,58 @@


    /**
     * Heuristically decides whether the beginning of @p file looks like
     * non-ASCII UTF-8 text.
     *
     * Up to 1024 bytes are peeked from the device and checked against the
     * UTF-8 byte structure: every lead byte must be one that can start a
     * 2-, 3- or 4-byte sequence, and it must be followed by the matching
     * number of continuation bytes (10xxxxxx).
     *
     * @param file Device to inspect; must not be null. It is only peeked.
     * @return true if the sample contains at least one non-ASCII byte and no
     *         structurally invalid byte was found; false if the sample is
     *         empty, unreadable, pure ASCII, or violates the UTF-8 structure.
     */
    bool MainWindow::isUtf8File(QIODevice *file)
    {
        const int testSize = 1024;
        char str[testSize];
        // A non-positive result (nothing available / error) skips the loop,
        // so the function falls through to the "all ASCII" answer: false.
        const int size = static_cast<int>(file->peek(str, testSize));

        // No dedicated BOM test is needed: EF BB BF is itself a well-formed
        // three-byte sequence and is accepted by the loop below.

        int encodingBytesCount = 0;        // continuation bytes still expected
        bool allTextsAreASCIIChars = true;

        for (int i = 0; i < size; ++i) {
            // Work on an unsigned value so comparisons and masks do not depend
            // on whether plain char is signed on this platform.
            const unsigned char current = static_cast<unsigned char>(str[i]);

            if (current >= 0x80)
                allTextsAreASCIIChars = false;

            if (encodingBytesCount > 0) {
                // Inside a multi-byte sequence: only 10xxxxxx is allowed.
                if ((current & 0xC0) != 0x80)
                    return false;
                --encodingBytesCount;
                continue;
            }

            if (current < 0x80)
                continue; // ASCII chars, from 0x00-0x7F

            // Lead byte. Only the ranges below can start a sequence; this
            // rejects stray continuation bytes (0x80-0xBF), the overlong
            // leads 0xC0/0xC1, and 0xF5-0xFF, which the previous bit-counting
            // approach accepted (0xFF was treated as a lead of 7 followers).
            if (current >= 0xC2 && current <= 0xDF)
                encodingBytesCount = 1;
            else if (current >= 0xE0 && current <= 0xEF)
                encodingBytesCount = 2;
            else if (current >= 0xF0 && current <= 0xF4)
                encodingBytesCount = 3;
            else
                return false;
        }

        // A sequence still open here is deliberately tolerated: the sample is
        // capped at testSize bytes and may end in the middle of a character.

        // UTF-8 is a superset of ASCII, but a stream whose sampled contents are
        // all ASCII is treated as being in the default encoding, not as UTF-8.
        return !allTextsAreASCIIChars;
    }