vtk-dicom  0.8.17
vtkDICOMCharacterSet.h
1 /*=========================================================================
2 
3  Program: DICOM for VTK
4 
5  Copyright (c) 2012-2024 David Gobbi
6  All rights reserved.
7  See Copyright.txt or http://dgobbi.github.io/bsd3.txt for details.
8 
9  This software is distributed WITHOUT ANY WARRANTY; without even
10  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
11  PURPOSE. See the above copyright notice for more information.
12 
13 =========================================================================*/
14 #ifndef vtkDICOMCharacterSet_h
15 #define vtkDICOMCharacterSet_h
16 
17 #include "vtkSystemIncludes.h"
18 #include "vtkDICOMModule.h" // For export macro
19 #include "vtkDICOMConfig.h" // For configuration details
20 
21 #include <string>
22 
24 
54 class VTKDICOM_EXPORT vtkDICOMCharacterSet
55 {
56 public:
57  enum EnumType
58  {
59  ISO_IR_6 = 0, // US_ASCII
60  ISO_IR_13 = 1, // JIS X 0201, japanese romaji + katakana
61  ISO_IR_100 = 8, // ISO-8859-1, latin1, western europe
62  ISO_IR_101 = 9, // ISO-8859-2, latin2, central europe
63  ISO_IR_109 = 10, // ISO-8859-3, latin3, maltese
64  ISO_IR_110 = 11, // ISO-8859-4, latin4, baltic
65  ISO_IR_144 = 12, // ISO-8859-5, cyrillic
66  ISO_IR_127 = 13, // ISO-8859-6, arabic
67  ISO_IR_126 = 14, // ISO-8859-7, greek
68  ISO_IR_138 = 15, // ISO-8859-8, hebrew
69  ISO_IR_148 = 16, // ISO-8859-9, latin5, turkish
70  X_LATIN6 = 17, // ISO-8859-10, latin6, nordic
71  ISO_IR_166 = 18, // ISO-8859-11, thai
72  X_LATIN7 = 19, // ISO-8859-13, latin7, baltic rim
73  X_LATIN8 = 20, // ISO-8859-14, latin8, celtic
74  ISO_IR_203 = 21, // ISO-8859-15, latin9, western europe
75  X_LATIN9 = 21, // key from before ISO_IR 203 entered DICOM
76  X_LATIN10 = 22, // ISO-8859-16, latin10, southeastern europe
77  X_EUCKR = 24, // euc-kr, korean without escape codes
78  X_GB2312 = 25, // gb2312, chinese without escape codes
79  ISO_2022_IR_6 = 32, // US_ASCII
80  ISO_2022_IR_13 = 33, // JIS X 0201, japanese romaji and katakana
81  ISO_2022_IR_87 = 34, // JIS X 0208, iso-2022-jp with ascii
82  ISO_2022_IR_13_87 = 35, // JIS X 0201+0208, iso-2022-jp with romaji
83  ISO_2022_IR_159 = 36, // JIS X 0212, japanese supplementary
84  ISO_2022_IR_87_159 = 38, // JIS X 0208+0212, iso-2022-jp-2 subset
85  ISO_2022_IR_13_87_159 = 39, // JIS X 0201+0208+0212, iso-2022-jp-2 subset
86  ISO_2022_IR_100 = 40, // ISO-8859-1, latin1, western europe
87  ISO_2022_IR_101 = 41, // ISO-8859-2, latin2, central europe
88  ISO_2022_IR_109 = 42, // ISO-8859-3, latin3, maltese
89  ISO_2022_IR_110 = 43, // ISO-8859-4, latin4, baltic
90  ISO_2022_IR_144 = 44, // ISO-8859-5, cyrillic
91  ISO_2022_IR_127 = 45, // ISO-8859-6, arabic
92  ISO_2022_IR_126 = 46, // ISO-8859-7, greek
93  ISO_2022_IR_138 = 47, // ISO-8859-8, hebrew
94  ISO_2022_IR_148 = 48, // ISO-8859-9, latin5, turkish
95  ISO_2022_IR_166 = 50, // ISO-8859-11, thai
96  ISO_2022_IR_203 = 53, // ISO-8859-15, latin9, western europe
97  ISO_2022_IR_149 = 56, // KS X 1001, korean in G1 with escape codes
98  ISO_2022_IR_58 = 57, // GB2312, chinese in G1 with escape codes
99  X_ISO_2022_JP = 58, // iso-2022-jp with ascii and romaji
100  X_ISO_2022_JP_1 = 59, // like above, with addition of JIS X 0212
101  X_ISO_2022_JP_2 = 60, // adds chinese, korean, latin1, greek
102  X_ISO_2022_JP_EXT = 61, // iso-2022-jp-1 plus half-width katakana
103  ISO_IR_192 = 64, // UTF-8, unicode
104  GB18030 = 65, // gb18030, chinese with full unicode mapping
105  GBK = 66, // gbk, chinese
106  X_BIG5 = 67, // big5 + ETEN, traditional chinese
107  X_EUCJP = 69, // euc-jp, unix encoding for japanese
108  X_SJIS = 70, // windows-31j, aka shift-jis, code page 932
109  X_CP874 = 76, // cp1162, thai (windows-874)
110  X_CP1250 = 80, // cp1250, central europe
111  X_CP1251 = 81, // cp1251, cyrillic
112  X_CP1252 = 82, // cp1252, western europe
113  X_CP1253 = 83, // cp1253, greek
114  X_CP1254 = 84, // cp1254, turkish
115  X_CP1255 = 85, // cp1255, hebrew
116  X_CP1256 = 86, // cp1256, arabic
117  X_CP1257 = 87, // cp1257, baltic rim
118  X_CP1258 = 88, // cp1258, vietnamese
119  X_KOI8 = 90, // koi8, cyrillic
120  Unknown = 255 // signifies unknown character set
121  };
122 
124  vtkDICOMCharacterSet() : Key(0) {}
126 
128 
137  vtkDICOMCharacterSet(int k) : Key(static_cast<unsigned char>(k)) {}
138 
140 
144  explicit vtkDICOMCharacterSet(const std::string& name) {
145  this->Key = KeyFromString(name.data(), name.length()); }
146  vtkDICOMCharacterSet(const char *name, size_t nl) {
147  this->Key = KeyFromString(name, nl); }
149 
151 
160  GlobalDefault = cs.GetKey(); }
161  static vtkDICOMCharacterSet GetGlobalDefault() {
162  return GlobalDefault; }
163 
165 
170  static void SetGlobalOverride(bool b) {
171  GlobalOverride = b; }
172  static void GlobalOverrideOn() { GlobalOverride = true; }
173  static void GlobalOverrideOff() { GlobalOverride = false; }
174  static bool GetGlobalOverride() { return GlobalOverride; }
176 
178 
185  std::string GetCharacterSetString() const;
186 
188 
195  const char *GetDefinedTerm() const;
196 
198 
209  const char *GetMIMEName() const;
210 
212 
217  const char *GetName() const;
218 
220  unsigned char GetKey() const { return this->Key; }
222 
224 
234  std::string FromUTF8(const char *text, size_t l, size_t *lp=nullptr) const;
235  std::string FromUTF8(const std::string& text) const {
236  return FromUTF8(text.data(), text.length()); }
237 
239 
251  std::string ToUTF8(const char *text, size_t l, size_t *lp=nullptr) const;
252  std::string ToUTF8(const std::string& text) const {
253  return ToUTF8(text.data(), text.length()); }
254 
256  std::string ConvertToUTF8(const char *text, size_t l) const;
257 
259 
264  std::string ToSafeUTF8(const char *text, size_t l) const;
265  std::string ToSafeUTF8(const std::string& text) const {
266  return ToSafeUTF8(text.data(), text.length()); }
267 
269 
277  std::string CaseFoldedUTF8(const char *text, size_t l) const;
278  std::string CaseFoldedUTF8(const std::string& text) const {
279  return CaseFoldedUTF8(text.data(), text.length()); }
280 
282 
286  bool IsISO2022() const {
287  return ((this->Key & ISO_2022_MAX) == (this->Key | ISO_2022_MIN));
288  }
289 
291  bool IsISO8859() const {
292  return (this->Key >= ISO_IR_100 && this->Key <= X_LATIN10);
293  }
294 
296 
302  bool IsBiDirectional() const {
303  return (this->Key == ISO_IR_127 ||
304  this->Key == ISO_IR_138 ||
305  this->Key == X_CP1255 ||
306  this->Key == X_CP1256); }
308 
310 
316  unsigned int CountBackslashes(const char *text, size_t l) const;
317 
319 
323  size_t NextBackslash(const char *text, const char *end) const;
325 
327  bool operator==(vtkDICOMCharacterSet b) const { return (this->Key == b.Key); }
328  bool operator!=(vtkDICOMCharacterSet b) const { return (this->Key != b.Key); }
329  bool operator<=(vtkDICOMCharacterSet a) const { return (this->Key <= a.Key); }
330  bool operator>=(vtkDICOMCharacterSet a) const { return (this->Key >= a.Key); }
331  bool operator<(vtkDICOMCharacterSet a) const { return (this->Key < a.Key); }
332  bool operator>(vtkDICOMCharacterSet a) const { return (this->Key > a.Key); }
334 
335 private:
336 
337  // ISO-2022 Escape Codes
338  enum EscapeType {
339  CODE_ACS, // Announcer Code Sequence
340  CODE_CZD, // C0 Designate
341  CODE_C1D, // C1 Designate
342  CODE_GZD, // G0 Designate
343  CODE_G1D, // G1 Designate
344  CODE_G2D, // G2 Designate
345  CODE_G3D, // G3 Designate
346  CODE_DOCS, // Designate Other Coding System
347  CODE_CMD, // Coding Method Delimiter
348  CODE_IRR, // Identify Revised Registration
349  CODE_SS2, // Single Shift Two
350  CODE_SS3, // Single Shift Three
351  CODE_LS2, // Locking Shift Two
352  CODE_LS3, // Locking Shift Three
353  CODE_LS1R, // Locking Shift One Right
354  CODE_LS2R, // Locking Shift Two Right
355  CODE_LS3R, // Locking Shift Three Right
356  CODE_OTHER = 254, // Unrecognized
357  CODE_ERROR = 255 // Failure indicator
358  };
359 
360  // ISO-2022 State Bitfield
361  enum StateType {
362  ALTERNATE_CS = 0x00FF,
363  MULTIBYTE_G0 = 0x0100,
364  MULTIBYTE_G1 = 0x0200,
365  MULTIBYTE_G2 = 0x0400,
366  MULTIBYTE_G3 = 0x0800,
367  CHARSET96_GX = 0x1000,
368  CHARSET96_G1 = 0x2000,
369  CHARSET96_G2 = 0x4000,
370  CHARSET96_G3 = 0x8000
371  };
372 
373  // Other ISO-2022
374  enum {
375  DICOM_JP_BITS = 39,
376  ISO_2022_BASE = 31,
377  ISO_2022_MIN = 32,
378  ISO_2022_MAX = 63
379  };
380 
381  size_t AnyToUTF8(const char *t, size_t l, std::string *s, int m) const;
382  size_t UTF8ToSingleByte(const char *t, size_t l, std::string *s, int m) const;
383  size_t SingleByteToUTF8(const char *t, size_t l, std::string *s, int m) const;
384  size_t ISO8859ToUTF8(const char *t, size_t l, std::string *s, int m) const;
385  size_t UTF8ToISO2022(const char *t, size_t l, std::string *s, int m) const;
386  size_t ISO2022ToUTF8(const char *t, size_t l, std::string *s, int m) const;
387  size_t UTF8ToEUCKR(const char *t, size_t l, std::string *s, int m) const;
388  static size_t EUCKRToUTF8(const char *t, size_t l, std::string *s, int m);
389  static size_t UTF8ToGB2312(const char *t, size_t l, std::string *s, int m);
390  static size_t GB2312ToUTF8(const char *t, size_t l, std::string *s, int m);
391  static size_t UTF8ToGB18030(const char *t, size_t l, std::string *s, int m);
392  static size_t GB18030ToUTF8(const char *t, size_t l, std::string *s, int m);
393  static size_t UTF8ToGBK(const char *t, size_t l, std::string *s, int m);
394  static size_t GBKToUTF8(const char *t, size_t l, std::string *s, int m);
395  static size_t UTF8ToBig5(const char *t, size_t l, std::string *s, int m);
396  static size_t Big5ToUTF8(const char *t, size_t l, std::string *s, int m);
397  static size_t UTF8ToEUCJP(const char *t, size_t l, std::string *s, int m);
398  static size_t EUCJPToUTF8(const char *t, size_t l, std::string *s, int m);
399  static size_t UTF8ToSJIS(const char *t, size_t l, std::string *s, int m);
400  static size_t SJISToUTF8(const char *t, size_t l, std::string *s, int m);
401  static size_t UTF8ToJISX(
402  int charset, const char *t, size_t l, std::string *s, int m);
403  static size_t JISXToUTF8(
404  int csGL, int csGR, const char *t, size_t l, std::string *s, int m);
405  static size_t UTF8ToCP1258(const char *t, size_t l, std::string *s, int m);
406  static size_t CP1258ToUTF8(const char *t, size_t l, std::string *s, int m);
407  static size_t UTF8ToJISX0201(const char *t, size_t l, std::string *s, int m);
408 
409  unsigned int InitISO2022(unsigned char G[4]) const;
410  static EscapeType EscapeCode(const char *cp, size_t l, unsigned int *state);
411  unsigned char CharacterSetFromEscapeCodeJP(const char *code, size_t l) const;
412  unsigned char CharacterSetFromEscapeCode(const char *code, size_t l) const;
413  static unsigned char KeyFromString(const char *name, size_t nl);
414 
415  unsigned char Key;
416 
417  static unsigned char GlobalDefault;
418  static bool GlobalOverride;
419 
420  static const unsigned short *Table[256];
421  static const unsigned short *Reverse[256];
422 
423  static const int NumberOfAliases;
424  static const char *const Aliases[];
425  static const unsigned char AliasKeys[];
426 };
427 
// Stream-insertion overload that takes a vtkDICOMCharacterSet by const
// reference.  Only the declaration appears in this header.
428 VTKDICOM_EXPORT ostream& operator<<(ostream& o, const vtkDICOMCharacterSet& a);
429 
430 #endif /* vtkDICOMCharacterSet_h */
431 // VTK-HeaderTest-Exclude: vtkDICOMCharacterSet.h
Character sets.
Definition: vtkDICOMCharacterSet.h:55
bool IsISO8859() const
Returns true if this uses an ISO 8859 code page.
Definition: vtkDICOMCharacterSet.h:291
bool IsISO2022() const
Returns true if ISO 2022 escape codes are used.
Definition: vtkDICOMCharacterSet.h:286
bool IsBiDirectional() const
Check for bidirectional character sets.
Definition: vtkDICOMCharacterSet.h:302
std::string ToUTF8(const char *text, size_t l, size_t *lp=nullptr) const
Convert text from this encoding to UTF-8.
const char * GetName() const
Get a name that identifies this character set.
std::string ConvertToUTF8(const char *text, size_t l) const
Obsolete method for converting to UTF8.
vtkDICOMCharacterSet(const std::string &name)
Construct a character set object from a SpecificCharacterSet value.
Definition: vtkDICOMCharacterSet.h:144
std::string ToSafeUTF8(const char *text, size_t l) const
Convert text to UTF-8 that is safe to print to the console.
std::string GetCharacterSetString() const
Generate SpecificCharacterSet code values (diagnostic only).
size_t NextBackslash(const char *text, const char *end) const
Get the offset to the next backslash, or to the end of the string.
vtkDICOMCharacterSet(int k)
Construct a character set object from a given code.
Definition: vtkDICOMCharacterSet.h:137
const char * GetDefinedTerm() const
Get the defined term (possibly multi-valued) for this character set.
unsigned char GetKey() const
Get the numerical code for this character set object.
Definition: vtkDICOMCharacterSet.h:220
std::string CaseFoldedUTF8(const char *text, size_t l) const
Convert text into a form suitable for case-insensitive matching.
const char * GetMIMEName() const
Get the internet MIME name for this character set.
static void SetGlobalDefault(vtkDICOMCharacterSet cs)
Set the character set to use if SpecificCharacterSet is missing.
Definition: vtkDICOMCharacterSet.h:159
static void SetGlobalOverride(bool b)
Override the value stored in SpecificCharacterSet with the default.
Definition: vtkDICOMCharacterSet.h:170
std::string FromUTF8(const char *text, size_t l, size_t *lp=nullptr) const
Convert text from UTF-8 to this encoding.
unsigned int CountBackslashes(const char *text, size_t l) const
Count the number of backslashes in an encoded string.