How to read utf16 text file to string in golang?

UTF16, UTF8, and Byte Order Marks are defined by the Unicode Consortium: UTF-16 FAQ, UTF-8 FAQ, and Byte Order Mark (BOM) FAQ.

Issue 4802: bufio: reading lines is too cumbersome

Reading lines from a file is too cumbersome in Go.

People are often drawn to bufio.Reader.ReadLine because of its name, but it has a weird signature, returning (line []byte, isPrefix bool, err error), and requires a lot of work.

ReadSlice and ReadString require a delimiter byte, which is almost always the obvious and unsightly '\n', and also can return both a line and an EOF

Revision: f685026a2d38

bufio: new Scanner interface

Add a new, simple interface for scanning (probably textual) data, based on a new type called Scanner. It does its own internal buffering, so should be plausibly efficient even without injecting a bufio.Reader. The format of the input is defined by a "split function", by default splitting into lines.

go1.1beta1 released

You can download binary and source distributions from the usual place: https://code.google.com/p/go/downloads/list?q=go1.1beta1

Here's a program which uses the Unicode rules to convert UTF16 text file lines to Go UTF8 encoded strings. The code has been revised to take advantage of the new bufio.Scanner interface in Go 1.1.

package main

import (
    "bufio"
    "bytes"
    "encoding/binary"
    "fmt"
    "os"
    "runtime"
    "unicode/utf16"
    "unicode/utf8"
)

// UTF16BytesToString converts UTF-16 encoded bytes, in big or little endian byte order,
// to a UTF-8 encoded string.
func UTF16BytesToString(b []byte, o binary.ByteOrder) string {
    utf := make([]uint16, (len(b)+(2-1))/2)
    for i := 0; i+(2-1) < len(b); i += 2 {
        utf[i/2] = o.Uint16(b[i:])
    }
    if len(b)/2 < len(utf) {
        utf[len(utf)-1] = utf8.RuneError
    }
    return string(utf16.Decode(utf))
}

// UTF-16 endian byte order
const (
    unknownEndian = iota
    bigEndian
    littleEndian
)

// dropCREndian drops a terminal \r from the endian data.
func dropCREndian(data []byte, t1, t2 byte) []byte {
    if len(data) > 1 {
        if data[len(data)-2] == t1 && data[len(data)-1] == t2 {
            return data[0 : len(data)-2]
        }
    }
    return data
}

// dropCRBE drops a terminal \r from the big endian data.
func dropCRBE(data []byte) []byte {
    return dropCREndian(data, '\x00', '\r')
}

// dropCRLE drops a terminal \r from the little endian data.
func dropCRLE(data []byte) []byte {
    return dropCREndian(data, '\r', '\x00')
}

// dropCR drops a terminal \r from the data.
func dropCR(data []byte) ([]byte, int) {
    var endian = unknownEndian
    switch ld := len(data); {
    case ld != len(dropCRLE(data)):
        endian = littleEndian
    case ld != len(dropCRBE(data)):
        endian = bigEndian
    }
    return data, endian
}

// SplitFunc is a split function for a Scanner that returns each line of
// text, stripped of any trailing end-of-line marker. The returned line may
// be empty. The end-of-line marker is one optional carriage return followed
// by one mandatory newline. In regular expression notation, it is `\r?\n`.
// The last non-empty line of input will be returned even if it has no
// newline.
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder) (bufio.SplitFunc, func() binary.ByteOrder) {

    // Function closure variables
    var endian = unknownEndian
    switch byteOrder {
    case binary.BigEndian:
        endian = bigEndian
    case binary.LittleEndian:
        endian = littleEndian
    }
    const bom = 0xFEFF
    var checkBOM bool = endian == unknownEndian

    // Scanner split function
    splitFunc := func(data []byte, atEOF bool) (advance int, token []byte, err error) {

        if atEOF && len(data) == 0 {
            return 0, nil, nil
        }

        if checkBOM {
            checkBOM = false
            if len(data) > 1 {
                switch uint16(bom) {
                case uint16(data[0])<<8 | uint16(data[1]):
                    endian = bigEndian
                    return 2, nil, nil
                case uint16(data[1])<<8 | uint16(data[0]):
                    endian = littleEndian
                    return 2, nil, nil
                }
            }
        }

        // Scan for newline-terminated lines.
        i := 0
        for {
            j := bytes.IndexByte(data[i:], '\n')
            if j < 0 {
                break
            }
            i += j
            switch e := i % 2; e {
            case 1: // UTF-16BE
                if endian != littleEndian {
                    if i > 1 {
                        if data[i-1] == '\x00' {
                            endian = bigEndian
                            // We have a full newline-terminated line.
                            return i + 1, dropCRBE(data[0 : i-1]), nil
                        }
                    }
                }
            case 0: // UTF-16LE
                if endian != bigEndian {
                    if i+1 < len(data) {
                        i++
                        if data[i] == '\x00' {
                            endian = littleEndian
                            // We have a full newline-terminated line.
                            return i + 1, dropCRLE(data[0 : i-1]), nil
                        }
                    }
                }
            }
            i++
        }

        // If we're at EOF, we have a final, non-terminated line. Return it.
        if atEOF {
            // drop CR.
            advance = len(data)
            switch endian {
            case bigEndian:
                data = dropCRBE(data)
            case littleEndian:
                data = dropCRLE(data)
            default:
                data, endian = dropCR(data)
            }
            if endian == unknownEndian {
                if runtime.GOOS == "windows" {
                    endian = littleEndian
                } else {
                    endian = bigEndian
                }
            }
            return advance, data, nil
        }

        // Request more data.
        return 0, nil, nil
    }

    // Endian byte order function
    orderFunc := func() (byteOrder binary.ByteOrder) {
        switch endian {
        case bigEndian:
            byteOrder = binary.BigEndian
        case littleEndian:
            byteOrder = binary.LittleEndian
        }
        return byteOrder
    }

    return splitFunc, orderFunc
}

func main() {
    file, err := os.Open("utf16.le.txt")
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }
    defer file.Close()
    fmt.Println(file.Name())

    rdr := bufio.NewReader(file)
    scanner := bufio.NewScanner(rdr)
    var bo binary.ByteOrder // unknown, infer from data
    // bo = binary.LittleEndian // windows
    splitFunc, orderFunc := ScanUTF16LinesFunc(bo)
    scanner.Split(splitFunc)

    for scanner.Scan() {
        b := scanner.Bytes()
        s := UTF16BytesToString(b, orderFunc())
        fmt.Println(len(s), s)
        fmt.Println(len(b), b)
    }
    fmt.Println(orderFunc())

    if err := scanner.Err(); err != nil {
        fmt.Println(err)
    }
}

Output:

utf16.le.txt
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
0 
0 []
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
LittleEndian

utf16.be.txt
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
0 
0 []
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
BigEndian

Here is the simplest way to read it:

package main

import (
    "bufio"
    "fmt"
    "log"
    "os"

    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

func main() {
    file, err := os.Open("./text.txt")
    if err != nil {
        log.Fatal(err)
    }

    scanner := bufio.NewScanner(transform.NewReader(file, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewDecoder()))
    for scanner.Scan() {
        fmt.Printf(scanner.Text())
    }
}

since Windows use little-endian order by default link, we use unicode.UseBOM policy to retrieve BOM from the text, and unicode.LittleEndian as a fallback

The latest version of golang.org/x/text/encoding/unicode makes it easier to do this because it includes unicode.BOMOverride, which will intelligently interpret the BOM.

Here is ReadFileUTF16(), which is like os.ReadFile() but decodes UTF-16.

package main

import (
    "bytes"
    "fmt"
    "io/ioutil"
    "log"
    "strings"

    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

// Similar to ioutil.ReadFile() but decodes UTF-16.  Useful when
// reading data from MS-Windows systems that generate UTF-16BE files,
// but will do the right thing if other BOMs are found.
func ReadFileUTF16(filename string) ([]byte, error) {

    // Read the file into a []byte:
    raw, err := ioutil.ReadFile(filename)
    if err != nil {
        return nil, err
    }

    // Make an tranformer that converts MS-Win default to UTF8:
    win16be := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
    // Make a transformer that is like win16be, but abides by BOM:
    utf16bom := unicode.BOMOverride(win16be.NewDecoder())

    // Make a Reader that uses utf16bom:
    unicodeReader := transform.NewReader(bytes.NewReader(raw), utf16bom)

    // decode and print:
    decoded, err := ioutil.ReadAll(unicodeReader)
    return decoded, err
}

func main() {
    data, err := ReadFileUTF16("inputfile.txt")
    if err != nil {
        log.Fatal(err)
    }
    final := strings.Replace(string(data), "\r\n", "\n", -1)
    fmt.Println(final)

}

Here is NewScannerUTF16 which is like os.Open() but returns a scanner.

package main

import (
    "bufio"
    "fmt"
    "log"
    "os"

    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

type utfScanner interface {
    Read(p []byte) (n int, err error)
}

// Creates a scanner similar to os.Open() but decodes the file as UTF-16.
// Useful when reading data from MS-Windows systems that generate UTF-16BE
// files, but will do the right thing if other BOMs are found.
func NewScannerUTF16(filename string) (utfScanner, error) {

    // Read the file into a []byte:
    file, err := os.Open(filename)
    if err != nil {
        return nil, err
    }

    // Make an tranformer that converts MS-Win default to UTF8:
    win16be := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
    // Make a transformer that is like win16be, but abides by BOM:
    utf16bom := unicode.BOMOverride(win16be.NewDecoder())

    // Make a Reader that uses utf16bom:
    unicodeReader := transform.NewReader(file, utf16bom)
    return unicodeReader, nil
}

func main() {

    s, err := NewScannerUTF16("inputfile.txt")
    if err != nil {
        log.Fatal(err)
    }

    scanner := bufio.NewScanner(s)
    for scanner.Scan() {
        fmt.Println(scanner.Text()) // Println will add back the final '\n'
    }
    if err := scanner.Err(); err != nil {
        fmt.Fprintln(os.Stderr, "reading inputfile:", err)
    }

}

FYI: I have put these functions into an open source module and have made further improvements. See https://github.com/TomOnTime/utfutil/

How to read utf16 text file to string in golang?

Tags:

Unicode

Utf 16

Readline

Go

Related

Recent Posts