Manually decode input files as lenient UTF-8.

This commit is contained in:
Vidar Holen 2017-01-22 15:24:21 -08:00
parent 128d5d6013
commit 46fb91b44d
2 changed files with 47 additions and 3 deletions

View File

@ -175,6 +175,15 @@ ShellCheck uses the follow exit codes:
+ 3: ShellCheck was invoked with bad syntax (e.g. unknown flag). + 3: ShellCheck was invoked with bad syntax (e.g. unknown flag).
+ 4: ShellCheck was invoked with bad options (e.g. unknown formatter). + 4: ShellCheck was invoked with bad options (e.g. unknown formatter).
# LOCALE
This version of ShellCheck is only available in English. All files are
leniently decoded as UTF-8, with a fallback of ISO-8859-1 for invalid
sequences. `LC_CTYPE` is respected for output, and defaults to UTF-8 for
locales where encoding is unspecified (such as the `C` locale).
Windows users seeing `commitBuffer: invalid argument (invalid character)`
should set their terminal to use UTF-8 with `chcp 65001`.
# AUTHOR # AUTHOR
ShellCheck is written and maintained by Vidar Holen. ShellCheck is written and maintained by Vidar Holen.

View File

@ -31,6 +31,7 @@ import qualified ShellCheck.Formatter.TTY
import Control.Exception import Control.Exception
import Control.Monad import Control.Monad
import Control.Monad.Except import Control.Monad.Except
import Data.Bits
import Data.Char import Data.Char
import Data.Functor import Data.Functor
import Data.Either import Data.Either
@ -288,14 +289,48 @@ ioInterface options files = do
fallback path _ = return path fallback path _ = return path
inputFile file = do inputFile file = do
contents <- handle <-
if file == "-" if file == "-"
then getContents then return stdin
else readFile file else openBinaryFile file ReadMode
hSetBinaryMode handle True
contents <- decodeString <$> hGetContents handle -- closes handle
seq (length contents) $ seq (length contents) $
return contents return contents
-- Decode a char8 string into a utf8 string, with fallback on
-- ISO-8859-1. This avoids depending on additional libraries.
decodeString = decode
where
decode [] = []
decode (c:rest) | isAscii c = c : decode rest
decode (c:rest) =
let num = (fromIntegral $ ord c) :: Int
next = case num of
_ | num >= 0xFF -> Nothing
| num >= 0xFE -> construct (num .&. 0x00) 6 rest
| num >= 0xFC -> construct (num .&. 0x01) 5 rest
| num >= 0xF8 -> construct (num .&. 0x03) 4 rest
| num >= 0xF0 -> construct (num .&. 0x07) 3 rest
| num >= 0xE0 -> construct (num .&. 0x0F) 2 rest
| num >= 0xC0 -> construct (num .&. 0x1F) 1 rest
| True -> Nothing
in
case next of
Just (n, remainder) -> chr n : decode remainder
Nothing -> c : decode rest
construct x 0 rest = return (x, rest)
construct x n (c:rest) =
let num = (fromIntegral $ ord c) :: Int in
if num >= 0x80 && num <= 0xBF
then construct ((x `shiftL` 6) .|. (num .&. 0x3f)) (n-1) rest
else Nothing
construct _ _ _ = Nothing
verifyFiles files = verifyFiles files =
when (null files) $ do when (null files) $ do
printErr "No files specified.\n" printErr "No files specified.\n"