vtk-dicom  0.8.14
vtkDICOMCharacterSet.h
1 /*=========================================================================
2 
3  Program: DICOM for VTK
4 
5  Copyright (c) 2012-2022 David Gobbi
6  All rights reserved.
7  See Copyright.txt or http://dgobbi.github.io/bsd3.txt for details.
8 
9  This software is distributed WITHOUT ANY WARRANTY; without even
10  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
11  PURPOSE. See the above copyright notice for more information.
12 
13 =========================================================================*/
14 #ifndef vtkDICOMCharacterSet_h
15 #define vtkDICOMCharacterSet_h
16 
17 #include "vtkSystemIncludes.h"
18 #include "vtkDICOMModule.h" // For export macro
19 #include "vtkDICOMConfig.h" // For configuration details
20 
21 #include <string>
22 
24 
54 class VTKDICOM_EXPORT vtkDICOMCharacterSet
55 {
56 public:
57  enum EnumType
58  {
59  ISO_IR_6 = 0, // US_ASCII
60  ISO_IR_13 = 1, // JIS X 0201, japanese romaji + katakana
61  ISO_IR_100 = 8, // ISO-8859-1, latin1, western europe
62  ISO_IR_101 = 9, // ISO-8859-2, latin2, central europe
63  ISO_IR_109 = 10, // ISO-8859-3, latin3, maltese
64  ISO_IR_110 = 11, // ISO-8859-4, latin4, baltic
65  ISO_IR_144 = 12, // ISO-8859-5, cyrillic
66  ISO_IR_127 = 13, // ISO-8859-6, arabic
67  ISO_IR_126 = 14, // ISO-8859-7, greek
68  ISO_IR_138 = 15, // ISO-8859-8, hebrew
69  ISO_IR_148 = 16, // ISO-8859-9, latin5, turkish
70  X_LATIN6 = 17, // ISO-8859-10, latin6, nordic
71  ISO_IR_166 = 18, // ISO-8859-11, thai
72  X_LATIN7 = 19, // ISO-8859-13, latin7, baltic rim
73  X_LATIN8 = 20, // ISO-8859-14, latin8, celtic
74  X_LATIN9 = 21, // ISO-8859-15, latin9, western europe
75  X_LATIN10 = 22, // ISO-8859-16, latin10, southeastern europe
76  X_EUCKR = 24, // euc-kr, ISO_IR_149 without escape codes
77  X_GB2312 = 25, // gb2312, ISO_IR_58 without escape codes
78  ISO_2022_IR_6 = 32, // US_ASCII
79  ISO_2022_IR_13 = 33, // JIS X 0201, japanese katakana
80  ISO_2022_IR_87 = 34, // JIS X 0208, japanese 94x94 primary
81  ISO_2022_IR_159 = 36, // JIS X 0212, japanese 94x94 secondary
82  ISO_2022_IR_100 = 40, // ISO-8859-1, latin1, western europe
83  ISO_2022_IR_101 = 41, // ISO-8859-2, latin2, central europe
84  ISO_2022_IR_109 = 42, // ISO-8859-3, latin3, maltese
85  ISO_2022_IR_110 = 43, // ISO-8859-4, latin4, baltic
86  ISO_2022_IR_144 = 44, // ISO-8859-5, cyrillic
87  ISO_2022_IR_127 = 45, // ISO-8859-6, arabic
88  ISO_2022_IR_126 = 46, // ISO-8859-7, greek
89  ISO_2022_IR_138 = 47, // ISO-8859-8, hebrew
90  ISO_2022_IR_148 = 48, // ISO-8859-9, latin5, turkish
91  ISO_2022_IR_166 = 50, // ISO-8859-11, thai
92  ISO_2022_IR_149 = 56, // the KS X 1001 part of ISO-2022-KR
93  ISO_2022_IR_58 = 57, // the GB2312 part of ISO-2022-CN
94  ISO_IR_192 = 64, // UTF-8, unicode
95  GB18030 = 65, // gb18030, chinese with full unicode mapping
96  GBK = 66, // gbk, chinese
97  X_BIG5 = 67, // big5 + ETEN, traditional chinese
98  X_EUCJP = 69, // euc-jp, unix encoding for japanese
99  X_SJIS = 70, // windows-31j, aka shift-jis, code page 932
100  X_CP874 = 76, // cp1162, thai (windows-874)
101  X_CP1250 = 80, // cp1250, central europe
102  X_CP1251 = 81, // cp1251, cyrillic
103  X_CP1252 = 82, // cp1252, western europe
104  X_CP1253 = 83, // cp1253, greek
105  X_CP1254 = 84, // cp1254, turkish
106  X_CP1255 = 85, // cp1255, hebrew
107  X_CP1256 = 86, // cp1256, arabic
108  X_CP1257 = 87, // cp1257, baltic rim
109  X_KOI8 = 90, // koi8, cyrillic
110  Unknown = 255 // signifies unknown character set
111  };
112 
114  vtkDICOMCharacterSet() : Key(0) {}
116 
118 
127  vtkDICOMCharacterSet(int k) : Key(static_cast<unsigned char>(k)) {}
128 
130 
134  explicit vtkDICOMCharacterSet(const std::string& name) {
135  this->Key = KeyFromString(name.data(), name.length()); }
136  vtkDICOMCharacterSet(const char *name, size_t nl) {
137  this->Key = KeyFromString(name, nl); }
139 
141 
150  GlobalDefault = cs.GetKey(); }
151  static vtkDICOMCharacterSet GetGlobalDefault() {
152  return GlobalDefault; }
153 
155 
160  static void SetGlobalOverride(bool b) {
161  GlobalOverride = b; }
162  static void GlobalOverrideOn() { GlobalOverride = true; }
163  static void GlobalOverrideOff() { GlobalOverride = false; }
164  static bool GetGlobalOverride() { return GlobalOverride; }
166 
168 
179  std::string GetCharacterSetString() const;
180 
182  unsigned char GetKey() const { return this->Key; }
184 
186 
195  std::string FromUTF8(const char *text, size_t l, size_t *lp=0) const;
196  std::string FromUTF8(const std::string& text) const {
197  return FromUTF8(text.data(), text.length()); }
198 
200 
211  std::string ToUTF8(const char *text, size_t l, size_t *lp=0) const;
212  std::string ToUTF8(const std::string& text) const {
213  return ToUTF8(text.data(), text.length()); }
214 
216  std::string ConvertToUTF8(const char *text, size_t l) const;
217 
219 
224  std::string ToSafeUTF8(const char *text, size_t l) const;
225  std::string ToSafeUTF8(const std::string& text) const {
226  return ToSafeUTF8(text.data(), text.length()); }
227 
229 
237  std::string CaseFoldedUTF8(const char *text, size_t l) const;
238  std::string CaseFoldedUTF8(const std::string& text) const {
239  return CaseFoldedUTF8(text.data(), text.length()); }
240 
242 
246  bool IsISO2022() const {
247  return ((this->Key & ISO_2022_MAX) == (this->Key | ISO_2022));
248  }
249 
251  bool IsISO8859() const {
252  return (this->Key >= ISO_IR_100 && this->Key <= X_LATIN10);
253  }
254 
256 
262  bool IsBiDirectional() const {
263  return (this->Key == ISO_IR_127 ||
264  this->Key == ISO_IR_138 ||
265  this->Key == X_CP1255 ||
266  this->Key == X_CP1256); }
268 
270 
276  unsigned int CountBackslashes(const char *text, size_t l) const;
277 
279 
283  size_t NextBackslash(const char *text, const char *end) const;
285 
287  bool operator==(vtkDICOMCharacterSet b) const { return (this->Key == b.Key); }
288  bool operator!=(vtkDICOMCharacterSet b) const { return (this->Key != b.Key); }
289  bool operator<=(vtkDICOMCharacterSet a) const { return (this->Key <= a.Key); }
290  bool operator>=(vtkDICOMCharacterSet a) const { return (this->Key >= a.Key); }
291  bool operator<(vtkDICOMCharacterSet a) const { return (this->Key < a.Key); }
292  bool operator>(vtkDICOMCharacterSet a) const { return (this->Key > a.Key); }
294 
295 private:
296 
297  // ISO-2022 Escape Codes
298  enum EscapeType {
299  CODE_ACS, // Announcer Code Sequence
300  CODE_CZD, // C0 Designate
301  CODE_C1D, // C1 Designate
302  CODE_GZD, // G0 Designate
303  CODE_G1D, // G1 Designate
304  CODE_G2D, // G2 Designate
305  CODE_G3D, // G3 Designate
306  CODE_DOCS, // Designate Other Coding System
307  CODE_CMD, // Coding Method Delimiter
308  CODE_IRR, // Identify Revised Registration
309  CODE_SS2, // Single Shift Two
310  CODE_SS3, // Single Shift Three
311  CODE_LS2, // Locking Shift Two
312  CODE_LS3, // Locking Shift Three
313  CODE_LS1R, // Locking Shift One Right
314  CODE_LS2R, // Locking Shift Two Right
315  CODE_LS3R, // Locking Shift Three Right
316  CODE_OTHER = 254, // Unrecognized
317  CODE_ERROR = 255 // Failure indicator
318  };
319 
320  // ISO-2022 State Bitfield
321  enum StateType {
322  ALTERNATE_CS = 0x00FF,
323  MULTIBYTE_G0 = 0x0100,
324  MULTIBYTE_G1 = 0x0200,
325  MULTIBYTE_G2 = 0x0400,
326  MULTIBYTE_G3 = 0x0800,
327  CHARSET96_GX = 0x1000,
328  CHARSET96_G1 = 0x2000,
329  CHARSET96_G2 = 0x4000,
330  CHARSET96_G3 = 0x8000
331  };
332 
333  // Other ISO-2022
334  enum {
335  ISO_2022_JP_BASE = 7,
336  ISO_2022_BASE = 31,
337  ISO_2022 = 32,
338  ISO_2022_MAX = 63
339  };
340 
341  size_t AnyToUTF8(const char *t, size_t l, std::string *s, int m) const;
342  size_t UTF8ToSingleByte(const char *t, size_t l, std::string *s) const;
343  size_t SingleByteToUTF8(const char *t, size_t l, std::string *s, int m) const;
344  size_t ISO8859ToUTF8(const char *t, size_t l, std::string *s, int) const;
345  size_t UTF8ToISO2022(const char *t, size_t l, std::string *s) const;
346  size_t ISO2022ToUTF8(const char *t, size_t l, std::string *s, int m) const;
347  static size_t UTF8ToEUCKR(const char *t, size_t l, std::string *s);
348  static size_t EUCKRToUTF8(const char *t, size_t l, std::string *s, int m);
349  static size_t UTF8ToGB2312(const char *t, size_t l, std::string *s);
350  static size_t GB2312ToUTF8(const char *t, size_t l, std::string *s, int m);
351  static size_t UTF8ToGB18030(const char *t, size_t l, std::string *s);
352  static size_t GB18030ToUTF8(const char *t, size_t l, std::string *s, int m);
353  static size_t UTF8ToGBK(const char *t, size_t l, std::string *s);
354  static size_t GBKToUTF8(const char *t, size_t l, std::string *s, int m);
355  static size_t UTF8ToBig5(const char *t, size_t l, std::string *s);
356  static size_t Big5ToUTF8(const char *t, size_t l, std::string *s, int m);
357  static size_t UTF8ToEUCJP(const char *t, size_t l, std::string *s);
358  static size_t EUCJPToUTF8(const char *t, size_t l, std::string *s, int m);
359  static size_t UTF8ToSJIS(const char *t, size_t l, std::string *s);
360  static size_t SJISToUTF8(const char *t, size_t l, std::string *s, int m);
361  static size_t UTF8ToJISX(
362  int charset, const char *t, size_t l, std::string *s);
363  static size_t JISXToUTF8(
364  int csGL, int csGR, const char *t, size_t l, std::string *s, int m);
365 
366  static unsigned int InitISO2022(unsigned char key, unsigned char G[4]);
367  static EscapeType EscapeCode(const char *cp, size_t l, unsigned int *state);
368  static unsigned char CharacterSetFromEscapeCode(const char *code, size_t l);
369  static unsigned char KeyFromString(const char *name, size_t nl);
370 
371  unsigned char Key;
372 
373  static unsigned char GlobalDefault;
374  static bool GlobalOverride;
375 
376  static const unsigned short *Table[256];
377  static const unsigned short *Reverse[256];
378 };
379 
380 VTKDICOM_EXPORT ostream& operator<<(ostream& o, const vtkDICOMCharacterSet& a);
381 
382 #endif /* vtkDICOMCharacterSet_h */
383 // VTK-HeaderTest-Exclude: vtkDICOMCharacterSet.h
Character sets.
Definition: vtkDICOMCharacterSet.h:55
bool IsISO8859() const
Returns true if this uses an ISO 8859 code page.
Definition: vtkDICOMCharacterSet.h:251
bool IsISO2022() const
Returns true if ISO 2022 escape codes are used.
Definition: vtkDICOMCharacterSet.h:246
bool IsBiDirectional() const
Check for bidirectional character sets.
Definition: vtkDICOMCharacterSet.h:262
std::string ConvertToUTF8(const char *text, size_t l) const
Obsolete method for converting to UTF8.
vtkDICOMCharacterSet(const std::string &name)
Construct a character set object from a SpecificCharacterSet value.
Definition: vtkDICOMCharacterSet.h:134
std::string ToUTF8(const char *text, size_t l, size_t *lp=0) const
Convert text from this encoding to UTF-8.
std::string ToSafeUTF8(const char *text, size_t l) const
Convert text to UTF-8 that is safe to print to the console.
std::string GetCharacterSetString() const
Generate SpecificCharacterSet code values (diagnostic only).
size_t NextBackslash(const char *text, const char *end) const
Get the offset to the next backslash, or to the end of the string.
vtkDICOMCharacterSet(int k)
Construct a character set object from a given code.
Definition: vtkDICOMCharacterSet.h:127
std::string FromUTF8(const char *text, size_t l, size_t *lp=0) const
Convert text from UTF-8 to this encoding.
unsigned char GetKey() const
Get the numerical code for this character set object.
Definition: vtkDICOMCharacterSet.h:182
std::string CaseFoldedUTF8(const char *text, size_t l) const
Convert text into a form suitable for case-insensitive matching.
static void SetGlobalDefault(vtkDICOMCharacterSet cs)
Set the character set to use if SpecificCharacterSet is missing.
Definition: vtkDICOMCharacterSet.h:149
static void SetGlobalOverride(bool b)
Override the value stored in SpecificCharacterSet with the default.
Definition: vtkDICOMCharacterSet.h:160
unsigned int CountBackslashes(const char *text, size_t l) const
Count the number of backslashes in an encoded string.