doc/api/vtkDICOMCharacterSet_8h_source.html

 /*=========================================================================


   Program: DICOM for VTK


   Copyright (c) 2012-2024 David Gobbi

   All rights reserved.

   See Copyright.txt or http://dgobbi.github.io/bsd3.txt for details.


      This software is distributed WITHOUT ANY WARRANTY; without even

      the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR

      PURPOSE.  See the above copyright notice for more information.


 =========================================================================*/

 #ifndef vtkDICOMCharacterSet_h

 #define vtkDICOMCharacterSet_h


 #include "vtkSystemIncludes.h"

 #include "vtkDICOMModule.h" // For export macro

 #include "vtkDICOMConfig.h" // For configuration details


 #include <string>


 class VTKDICOM_EXPORT vtkDICOMCharacterSet

 {

 public:

   enum EnumType

   {

     ISO_IR_6   = 0,  // US_ASCII

     ISO_IR_13  = 1,  // JIS X 0201,  japanese romaji + katakana

     ISO_IR_100 = 8,  // ISO-8859-1,  latin1, western europe

     ISO_IR_101 = 9,  // ISO-8859-2,  latin2, central europe

     ISO_IR_109 = 10, // ISO-8859-3,  latin3, maltese

     ISO_IR_110 = 11, // ISO-8859-4,  latin4, baltic

     ISO_IR_144 = 12, // ISO-8859-5,  cyrillic

     ISO_IR_127 = 13, // ISO-8859-6,  arabic

     ISO_IR_126 = 14, // ISO-8859-7,  greek

     ISO_IR_138 = 15, // ISO-8859-8,  hebrew

     ISO_IR_148 = 16, // ISO-8859-9,  latin5, turkish

     X_LATIN6   = 17, // ISO-8859-10, latin6, nordic

     ISO_IR_166 = 18, // ISO-8859-11, thai

     X_LATIN7   = 19, // ISO-8859-13, latin7, baltic rim

     X_LATIN8   = 20, // ISO-8859-14, latin8, celtic

     ISO_IR_203 = 21, // ISO-8859-15, latin9, western europe

     X_LATIN9   = 21, // key from before ISO_IR 203 entered DICOM

     X_LATIN10  = 22, // ISO-8859-16, latin10, southeastern europe

     X_EUCKR    = 24, // euc-kr,      korean without escape codes

     X_GB2312   = 25, // gb2312,      chinese without escape codes

     ISO_2022_IR_6   = 32, // US_ASCII

     ISO_2022_IR_13  = 33, // JIS X 0201,  japanese romaji and katakana

     ISO_2022_IR_87  = 34, // JIS X 0208,  iso-2022-jp with ascii

     ISO_2022_IR_13_87 = 35, // JIS X 0201+0208, iso-2022-jp with romaji

     ISO_2022_IR_159 = 36, // JIS X 0212,  japanese supplementary

     ISO_2022_IR_87_159 = 38, // JIS X 0208+0212, iso-2022-jp-2 subset

     ISO_2022_IR_13_87_159 = 39, // JIS X 0201+0208+0212, iso-2022-jp-2 subset

     ISO_2022_IR_100 = 40, // ISO-8859-1,  latin1, western europe

     ISO_2022_IR_101 = 41, // ISO-8859-2,  latin2, central europe

     ISO_2022_IR_109 = 42, // ISO-8859-3,  latin3, maltese

     ISO_2022_IR_110 = 43, // ISO-8859-4,  latin4, baltic

     ISO_2022_IR_144 = 44, // ISO-8859-5,  cyrillic

     ISO_2022_IR_127 = 45, // ISO-8859-6,  arabic

     ISO_2022_IR_126 = 46, // ISO-8859-7,  greek

     ISO_2022_IR_138 = 47, // ISO-8859-8,  hebrew

     ISO_2022_IR_148 = 48, // ISO-8859-9,  latin5, turkish

     ISO_2022_IR_166 = 50, // ISO-8859-11, thai

     ISO_2022_IR_203 = 53, // ISO-8859-15, latin9, western europe

     ISO_2022_IR_149 = 56, // KS X 1001, korean in G1 with escape codes

     ISO_2022_IR_58  = 57, // GB2312, chinese in G1 with escape codes

     X_ISO_2022_JP   = 58, // iso-2022-jp with ascii and romaji

     X_ISO_2022_JP_1 = 59, // like above, with addition of JIS X 0212

     X_ISO_2022_JP_2 = 60, // adds chinese, korean, latin1, greek

     X_ISO_2022_JP_EXT = 61, // iso-2022-jp-1 plus half-width katakana

     ISO_IR_192 = 64, // UTF-8,       unicode

     GB18030    = 65, // gb18030,     chinese with full unicode mapping

     GBK        = 66, // gbk,         chinese

     X_BIG5     = 67, // big5 + ETEN, traditional chinese

     X_EUCJP    = 69, // euc-jp,      unix encoding for japanese

     X_SJIS     = 70, // windows-31j, aka shift-jis, code page 932

     X_CP874    = 76, // cp1162,      thai (windows-874)

     X_CP1250   = 80, // cp1250,      central europe

     X_CP1251   = 81, // cp1251,      cyrillic

     X_CP1252   = 82, // cp1252,      western europe

     X_CP1253   = 83, // cp1253,      greek

     X_CP1254   = 84, // cp1254,      turkish

     X_CP1255   = 85, // cp1255,      hebrew

     X_CP1256   = 86, // cp1256,      arabic

     X_CP1257   = 87, // cp1257,      baltic rim

     X_CP1258   = 88, // cp1258,      vietnamese

     X_KOI8     = 90, // koi8,        cyrillic

     Unknown    = 255  // signifies unknown character set

   };


   vtkDICOMCharacterSet() : Key(0) {}


   vtkDICOMCharacterSet(int k) : Key(static_cast<unsigned char>(k)) {}


   explicit vtkDICOMCharacterSet(const std::string& name) {

     this->Key = KeyFromString(name.data(), name.length()); }

   vtkDICOMCharacterSet(const char *name, size_t nl) {

     this->Key = KeyFromString(name, nl); }


   static void SetGlobalDefault(vtkDICOMCharacterSet cs) {

     GlobalDefault = cs.GetKey(); }

   static vtkDICOMCharacterSet GetGlobalDefault() {

     return GlobalDefault; }


   static void SetGlobalOverride(bool b) {

     GlobalOverride = b; }

   static void GlobalOverrideOn() { GlobalOverride = true; }

   static void GlobalOverrideOff() { GlobalOverride = false; }

   static bool GetGlobalOverride() { return GlobalOverride; }


   std::string GetCharacterSetString() const;


   const char *GetDefinedTerm() const;


   const char *GetMIMEName() const;


   const char *GetName() const;


   unsigned char GetKey() const { return this->Key; }


   std::string FromUTF8(const char *text, size_t l, size_t *lp=nullptr) const;

   std::string FromUTF8(const std::string& text) const {

     return FromUTF8(text.data(), text.length()); }


   std::string ToUTF8(const char *text, size_t l, size_t *lp=nullptr) const;

   std::string ToUTF8(const std::string& text) const {

     return ToUTF8(text.data(), text.length()); }


   std::string ConvertToUTF8(const char *text, size_t l) const;


   std::string ToSafeUTF8(const char *text, size_t l) const;

   std::string ToSafeUTF8(const std::string& text) const {

     return ToSafeUTF8(text.data(), text.length()); }


   std::string CaseFoldedUTF8(const char *text, size_t l) const;

   std::string CaseFoldedUTF8(const std::string& text) const {

     return CaseFoldedUTF8(text.data(), text.length()); }


   bool IsISO2022() const {

     return ((this->Key & ISO_2022_MAX) == (this->Key | ISO_2022_MIN));

   }


   bool IsISO8859() const {

     return (this->Key >= ISO_IR_100 && this->Key <= X_LATIN10);

   }


   bool IsBiDirectional() const {

     return (this->Key == ISO_IR_127 ||

             this->Key == ISO_IR_138 ||

             this->Key == X_CP1255 ||

             this->Key == X_CP1256); }


   unsigned int CountBackslashes(const char *text, size_t l) const;


   size_t NextBackslash(const char *text, const char *end) const;


   bool operator==(vtkDICOMCharacterSet b) const { return (this->Key == b.Key); }

   bool operator!=(vtkDICOMCharacterSet b) const { return (this->Key != b.Key); }

   bool operator<=(vtkDICOMCharacterSet a) const { return (this->Key <= a.Key); }

   bool operator>=(vtkDICOMCharacterSet a) const { return (this->Key >= a.Key); }

   bool operator<(vtkDICOMCharacterSet a) const { return (this->Key < a.Key); }

   bool operator>(vtkDICOMCharacterSet a) const { return (this->Key > a.Key); }


 private:


   // ISO-2022 Escape Codes

   enum EscapeType {

     CODE_ACS,  // Announcer Code Sequence

     CODE_CZD,  // C0 Designate

     CODE_C1D,  // C1 Designate

     CODE_GZD,  // G0 Designate

     CODE_G1D,  // G1 Designate

     CODE_G2D,  // G2 Designate

     CODE_G3D,  // G3 Designate

     CODE_DOCS, // Designate Other Coding System

     CODE_CMD,  // Coding Method Delimiter

     CODE_IRR,  // Identify Revised Registration

     CODE_SS2,  // Single Shift Two

     CODE_SS3,  // Single Shift Three

     CODE_LS2,  // Locking Shift Two

     CODE_LS3,  // Locking Shift Three

     CODE_LS1R, // Locking Shift One Right

     CODE_LS2R, // Locking Shift Two Right

     CODE_LS3R,  // Locking Shift Three Right

     CODE_OTHER = 254, // Unrecognized

     CODE_ERROR = 255 // Failure indicator

   };


   // ISO-2022 State Bitfield

   enum StateType {

     ALTERNATE_CS = 0x00FF,

     MULTIBYTE_G0 = 0x0100,

     MULTIBYTE_G1 = 0x0200,

     MULTIBYTE_G2 = 0x0400,

     MULTIBYTE_G3 = 0x0800,

     CHARSET96_GX = 0x1000,

     CHARSET96_G1 = 0x2000,

     CHARSET96_G2 = 0x4000,

     CHARSET96_G3 = 0x8000

   };


   // Other ISO-2022

   enum {

     DICOM_JP_BITS = 39,

     ISO_2022_BASE = 31,

     ISO_2022_MIN = 32,

     ISO_2022_MAX = 63

   };


   size_t AnyToUTF8(const char *t, size_t l, std::string *s, int m) const;

   size_t UTF8ToSingleByte(const char *t, size_t l, std::string *s, int m) const;

   size_t SingleByteToUTF8(const char *t, size_t l, std::string *s, int m) const;

   size_t ISO8859ToUTF8(const char *t, size_t l, std::string *s, int m) const;

   size_t UTF8ToISO2022(const char *t, size_t l, std::string *s, int m) const;

   size_t ISO2022ToUTF8(const char *t, size_t l, std::string *s, int m) const;

   size_t UTF8ToEUCKR(const char *t, size_t l, std::string *s, int m) const;

   static size_t EUCKRToUTF8(const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToGB2312(const char *t, size_t l, std::string *s, int m);

   static size_t GB2312ToUTF8(const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToGB18030(const char *t, size_t l, std::string *s, int m);

   static size_t GB18030ToUTF8(const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToGBK(const char *t, size_t l, std::string *s, int m);

   static size_t GBKToUTF8(const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToBig5(const char *t, size_t l, std::string *s, int m);

   static size_t Big5ToUTF8(const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToEUCJP(const char *t, size_t l, std::string *s, int m);

   static size_t EUCJPToUTF8(const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToSJIS(const char *t, size_t l, std::string *s, int m);

   static size_t SJISToUTF8(const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToJISX(

     int charset, const char *t, size_t l, std::string *s, int m);

   static size_t JISXToUTF8(

     int csGL, int csGR, const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToCP1258(const char *t, size_t l, std::string *s, int m);

   static size_t CP1258ToUTF8(const char *t, size_t l, std::string *s, int m);

   static size_t UTF8ToJISX0201(const char *t, size_t l, std::string *s, int m);


   unsigned int InitISO2022(unsigned char G[4]) const;

   static EscapeType EscapeCode(const char *cp, size_t l, unsigned int *state);

   unsigned char CharacterSetFromEscapeCodeJP(const char *code, size_t l) const;

   unsigned char CharacterSetFromEscapeCode(const char *code, size_t l) const;

   static unsigned char KeyFromString(const char *name, size_t nl);


   unsigned char Key;


   static unsigned char GlobalDefault;

   static bool GlobalOverride;


   static const unsigned short *Table[256];

   static const unsigned short *Reverse[256];


   static const int NumberOfAliases;

   static const char *const Aliases[];

   static const unsigned char AliasKeys[];

 };


 // Stream insertion operator for vtkDICOMCharacterSet, returning the stream.
 // NOTE(review): the text that is written is decided by the out-of-line
 // definition, which is not visible from this header.
 VTKDICOM_EXPORT ostream& operator<<(ostream& o, const vtkDICOMCharacterSet& a);


 #endif /* vtkDICOMCharacterSet_h */

 // VTK-HeaderTest-Exclude: vtkDICOMCharacterSet.h

vtkDICOMCharacterSet
Character sets.
Definition: vtkDICOMCharacterSet.h:55

vtkDICOMCharacterSet::IsISO8859
bool IsISO8859() const
Returns true if this uses an ISO 8859 code page.
Definition: vtkDICOMCharacterSet.h:291

vtkDICOMCharacterSet::IsISO2022
bool IsISO2022() const
Returns true if ISO 2022 escape codes are used.
Definition: vtkDICOMCharacterSet.h:286

vtkDICOMCharacterSet::IsBiDirectional
bool IsBiDirectional() const
Check for bidirectional character sets.
Definition: vtkDICOMCharacterSet.h:302

vtkDICOMCharacterSet::ToUTF8
std::string ToUTF8(const char *text, size_t l, size_t *lp=nullptr) const
Convert text from this encoding to UTF-8.

vtkDICOMCharacterSet::GetName
const char * GetName() const
Get a name that identifies this character set.

vtkDICOMCharacterSet::ConvertToUTF8
std::string ConvertToUTF8(const char *text, size_t l) const
Obsolete method for converting to UTF-8.

vtkDICOMCharacterSet::vtkDICOMCharacterSet
vtkDICOMCharacterSet(const std::string &name)
Construct a character set object from a SpecificCharacterSet value.
Definition: vtkDICOMCharacterSet.h:144

vtkDICOMCharacterSet::ToSafeUTF8
std::string ToSafeUTF8(const char *text, size_t l) const
Convert text to UTF-8 that is safe to print to the console.

vtkDICOMCharacterSet::GetCharacterSetString
std::string GetCharacterSetString() const
Generate SpecificCharacterSet code values (diagnostic only).

vtkDICOMCharacterSet::NextBackslash
size_t NextBackslash(const char *text, const char *end) const
Get the offset to the next backslash, or to the end of the string.

vtkDICOMCharacterSet::vtkDICOMCharacterSet
vtkDICOMCharacterSet(int k)
Construct a character set object from a given code.
Definition: vtkDICOMCharacterSet.h:137

vtkDICOMCharacterSet::GetDefinedTerm
const char * GetDefinedTerm() const
Get the defined term (possibly multi-valued) for this character set.

vtkDICOMCharacterSet::GetKey
unsigned char GetKey() const
Get the numerical code for this character set object.
Definition: vtkDICOMCharacterSet.h:220

vtkDICOMCharacterSet::CaseFoldedUTF8
std::string CaseFoldedUTF8(const char *text, size_t l) const
Convert text into a form suitable for case-insensitive matching.

vtkDICOMCharacterSet::GetMIMEName
const char * GetMIMEName() const
Get the internet MIME name for this character set.

vtkDICOMCharacterSet::SetGlobalDefault
static void SetGlobalDefault(vtkDICOMCharacterSet cs)
Set the character set to use if SpecificCharacterSet is missing.
Definition: vtkDICOMCharacterSet.h:159

vtkDICOMCharacterSet::SetGlobalOverride
static void SetGlobalOverride(bool b)
Override the value stored in SpecificCharacterSet with the default.
Definition: vtkDICOMCharacterSet.h:170

vtkDICOMCharacterSet::FromUTF8
std::string FromUTF8(const char *text, size_t l, size_t *lp=nullptr) const
Convert text from UTF-8 to this encoding.

vtkDICOMCharacterSet::CountBackslashes
unsigned int CountBackslashes(const char *text, size_t l) const
Count the number of backslashes in an encoded string.