Unicode String Functions (code snippet)

Description

The standard Lua String Manipulation functions are designed to work with ANSI encoded text, and some do not handle Unicode UTF-8 encoded text correctly.

The following functions supplement the Lua string library, and adapt to the current string encoding in either fh V5 or fh V6.

Requires: None

Code

UnicodeStringFunctions.fh_lua
-- Report the string encoding currently in effect --
-- Application versions up to 5 are always reported as "ANSI"; later versions
-- return whatever fhGetStringEncoding() supplies.
function encoding()
	if fhGetAppVersion() <= 5 then return "ANSI" end
	return fhGetStringEncoding()
end -- function encoding
 
-- Letter lookup tables shared by the functions below:
--   dicUpper : lower case letter -> upper case letter
--   dicLower : upper case letter -> lower case letter
--   dicCaseX : either case       -> "[Xx]" pattern set matching both cases
local dicUpper, dicLower, dicCaseX = { }, { }, { }

-- Fill the tables for the unaccented ASCII letters A to Z --
for intOffset = 0, string.byte("Z") - string.byte("A") do
   local strUpper = string.char(string.byte("A") + intOffset)
   local strLower = string.char(string.byte("a") + intOffset)
   local strEither = "["..strUpper..strLower.."]"
   dicUpper[strLower] = strUpper
   dicLower[strUpper] = strLower
   dicCaseX[strUpper] = strEither
   dicCaseX[strLower] = strEither
end
 
-- Count the characters in ANSI text, where every character is one byte --
-- A nil (or false) text counts as the empty string and yields 0.
function length(strTxt)
   if not strTxt then return 0 end
   return string.len(strTxt)
end -- function length
 
-- Extract characters i to j of ANSI text, using the index rules of string.sub --
-- A nil (or false) text is treated as the empty string.
function substring(strTxt,i,j)
   local strAnsi = strTxt or ""
   return string.sub(strAnsi,i,j)
end -- function substring
 
-- Turn each unaccented letter of ANSI text into a pattern set matching both cases --
-- Letters are looked up in dicCaseX (e.g. "a" and "A" both become "[Aa]");
-- all other characters pass through unchanged. A nil text yields "".
function caseless(strTxt)
   local strPattern = string.gsub(tostring(strTxt or ""),"[A-Za-z]",dicCaseX)
   return strPattern
end -- function caseless
 
if encoding() == "UTF-8" then
 
   -- Supply character length of UTF-8 text --
   -- The text is converted to ANSI, where each character is one byte, and the
   -- byte count of the converted text is returned. The conversion-loss flag is
   -- read beforehand and written back afterwards, so the call leaves the flag
   -- as it found it. A nil text is treated as the empty string.
   function length(strTxt)
      local isFlag = fhIsConversionLossFlagSet()   -- local, so no global variable is created
      local strAnsi = fhConvertUTF8toANSI(strTxt or "")
      fhSetConversionLossFlag(isFlag)
      return string.len(strAnsi)
   end -- function length
 
   -- Supply character substring of UTF-8 text --
   -- Works like string.sub, except that i and j count characters, not bytes:
   -- negative values count back from the end of the text and j defaults to -1.
   -- A nil text is treated as the empty string. A character is taken to be one
   -- lead byte (0-127 or 194-244) followed by any continuation bytes (128-191).
   function substring(strTxt,i,j)
      local strPattern = "([%z\1-\127\194-\244][\128-\191]*)"
      strTxt = strTxt or ""
      j = j or -1
      if i < 0 or j < 0 then
         -- Count characters with the same pattern that extracts them below, so
         -- both steps always agree on where each character starts and ends.
         local _, intChars = string.gsub(strTxt,strPattern,"")
         if j < 0 then j = j + intChars + 1 end
         if i < 0 then i = i + intChars + 1 end
      end
      local arrChr = { }
      local intPos = 0
      for strChr in string.gmatch(strTxt,strPattern) do
         if intPos >= j then break end   -- character j has already been passed
         intPos = intPos + 1
         if intPos >= i then arrChr[#arrChr+1] = strChr end
      end
      return table.concat(arrChr)        -- single join instead of repeated concatenation
   end -- function substring
 
   -- Convert lower case UTF-8 letters to upper case --
   -- Every ASCII lower case letter or multi-byte character is looked up in
   -- dicUpper; anything without an entry there is left as it is.
   -- A nil text yields "".
   function upper(strTxt)
      local strResult = string.gsub(tostring(strTxt or ""),"([a-z\194-\244][\128-\191]*)",dicUpper)
      return strResult
   end -- function upper
 
   -- Convert upper case UTF-8 letters to lower case --
   -- Every ASCII upper case letter or multi-byte character is looked up in
   -- dicLower; anything without an entry there is left as it is.
   -- A nil text yields "".
   function lower(strTxt)
      local strResult = string.gsub(tostring(strTxt or ""),"([A-Z\194-\244][\128-\191]*)",dicLower)
      return strResult
   end -- function lower
 
   -- Turn each UTF-8 letter into a pattern that matches both of its cases --
   -- Every ASCII letter or multi-byte character is looked up in dicCaseX;
   -- anything without an entry there is left as it is. A nil text yields "".
   function caseless(strTxt)
      local strPattern = string.gsub(tostring(strTxt or ""),"([A-Za-z\194-\244][\128-\191]*)",dicCaseX)
      return strPattern
   end -- function caseless
 
   -- Following tables use ASCII numeric coding to be immune from ANSI/UTF-8 encoding --
   -- (the accented letters themselves appear only in comments, never in string literals)

   -- Each arrPairs row describes a run of two-byte letters sharing one UTF-8 lead byte:
   --   [1] Prefix : the lead byte, as a one-character string
   --   [2] Beg    : second byte of the first upper case letter in the run
   --   [3] End    : second byte of the last upper case letter in the run
   --   [4] Inc    : step from one upper case second byte to the next
   --   [5] Offset : added to an upper case second byte to give its lower case partner
   -- NOTE(review): the 196 / 0x80-0xB6 run also pairs U+0130 with U+0131 (dotted
   -- capital I and dotless small i), which Unicode does not treat as a simple
   -- upper/lower pair -- confirm that this is acceptable.
   local arrPairs =   -- Upper & Lower case groups of UTF-8 letters with same prefix --
   {--   { Prefix, Beg , End , Inc, Offset Upper > Lower },   -- These include all ANSI letters and more
      { "\195", 0x80, 0x96,  1 , 32 },   -- 195=0xC3 À U+00C0 to Ö U+00D6 and à U+00E0 to ö U+00F6
      { "\195", 0x98, 0x9E,  1 , 32 },   -- 195=0xC3 Ø U+00D8 to Þ U+00DE and ø U+00F8 to þ U+00FE
      { "\196", 0x80, 0xB6,  2 ,  1 },   -- 196=0xC4 Ā U+0100 to ķ U+0137 in pairs
      { "\196", 0xB9, 0xBD,  2 ,  1 },   -- 196=0xC4 Ĺ U+0139 to ľ U+013E in pairs
      { "\197", 0x81, 0x87,  2 ,  1 },   -- 197=0xC5 Ł U+0141 to ň U+0148 in pairs
      { "\197", 0x8A, 0xB6,  2 ,  1 },   -- 197=0xC5 Ŋ U+014A to ŷ U+0177 in pairs
      { "\197", 0xB9, 0xBD,  2 ,  1 },   -- 197=0xC5 Ź U+0179 to ž U+017E in pairs
      { "\198", 0x82, 0x84,  2 ,  1 },   -- 198=0xC6 Ƃ  U+0182 to ƅ  U+0185 in pairs
      -- Add more Unicode groups here as usage increases --
   }
   -- dicPairs maps an upper case letter (key) to its lower case letter (value)
   -- for pairs whose two forms have different lead bytes, so no arrPairs row fits.
   local dicPairs =   -- Upper v Lower case UTF-8 letters that don't fit groups above --
   {   [string.char(0xC4,0xBF)] = string.char(0xC5,0x80),   -- Ŀ U+013F and ŀ U+0140
      [string.char(0xC5,0xB8)] = string.char(0xC3,0xBF),    -- Ÿ U+0178 and ÿ U+00FF
   }
 
   -- Populate the UTF-8 letter translation dictionaries --
   -- For each group, step through the second byte of every upper case letter,
   -- derive the lower case partner from the group's offset, and record the pair
   -- in dicUpper, dicLower and dicCaseX. Both letters of a pair share the same
   -- prefix byte, so the case-insensitive pattern is the prefix followed by a
   -- set holding the two alternative second bytes.
   for _, tblGroup in ipairs ( arrPairs ) do   -- UTF-8 accented letter groups
      local strPrefix = tblGroup[1]            -- local, so no global variable is created
      for intUpper = tblGroup[2], tblGroup[3], tblGroup[4] do
         local strUpper = string.char(intUpper)
         local strLower = string.char(intUpper + tblGroup[5])
         local strCaseX = strPrefix.."["..strUpper..strLower.."]"
         strUpper = strPrefix..strUpper
         strLower = strPrefix..strLower
         dicUpper[strLower] = strUpper
         dicLower[strUpper] = strLower
         dicCaseX[strLower] = strCaseX
         dicCaseX[strUpper] = strCaseX
      end
   end
   -- UTF-8 accented letters whose upper & lower case forms have different prefix bytes --
   for strBig, strSmall in pairs ( dicPairs ) do
      local arrSets = { }
      for intByte = 1, #strBig do
         -- One [..] set per byte position. This matches more than just the two
         -- letters (any mix of their bytes), but nothing tighter is possible.
         arrSets[intByte] = "["..string.sub(strBig,intByte,intByte)..string.sub(strSmall,intByte,intByte).."]"
      end
      local strEither = table.concat(arrSets)
      dicCaseX[strBig] = strEither
      dicCaseX[strSmall] = strEither
      dicLower[strBig] = strSmall
      dicUpper[strSmall] = strBig
   end
 
end

Usage

The text string may be ANSI encoded, where each character is one byte, or UTF-8 encoded, where accented characters such as Ø and ø each occupy two bytes.

local strText = "AaØøZz"
print( string.upper(strText) )          -->> AAØØZZ
print( string.lower(strText) )          -->> aaøøzz
print( string.len(strText) )            -->> 6 or 8   length in bytes
print( length(strText) )                -->> 6        length in characters
print( string.sub(strText,3,4) )        -->> Øø or Ø  substring in bytes
print( substring(strText,3,4) )         -->> Øø       substring in characters