00001
00002
00003
00004
00005
00006
00007
00008
00009 #include <stdlib.h>
00010 #include <string.h>
00011 #include <ctype.h>
00012 #if defined(__STDC_ISO_10646__)
00013 #include <wctype.h>
00014 #endif
00015 #include "antiword.h"
00016
/*
 * Unicode values for the byte values 0x80..0xff, indexed by (byte - 0x80).
 * ulTranslateCharacters selects this table when iWordVersion is 0 and the
 * Macintosh character set is not requested.
 * NOTE(review): the name suggests DOS code page 850 - confirm.
 */
00017 static const USHORT usCp850[] = {
00018 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, /* 0x80 */
00019 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, /* 0x88 */
00020 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, /* 0x90 */
00021 0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192, /* 0x98 */
00022 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, /* 0xa0 */
00023 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, /* 0xa8 */
00024 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x00c0, /* 0xb0 */
00025 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, 0x2510, /* 0xb8 */
00026 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3, /* 0xc0 */
00027 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, /* 0xc8 */
00028 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce, /* 0xd0 */
00029 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, /* 0xd8 */
00030 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, /* 0xe0 */
00031 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4, /* 0xe8 */
00032 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8, /* 0xf0 */
00033 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0, /* 0xf8 */
00034 };
00035
/*
 * Unicode values for the byte values 0x80..0xff, indexed by (byte - 0x80).
 * ulTranslateCharacters selects this table for encoding_latin_2.
 * Several entries hold 0x003f ('?').
 * NOTE(review): the name suggests Windows code page 1250, with 0x003f
 * standing in for unassigned bytes - confirm.
 */
00036 static const USHORT usCp1250[] = {
00037 0x20ac, 0x003f, 0x201a, 0x003f, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 */
00038 0x003f, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, /* 0x88 */
00039 0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 */
00040 0x003f, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, /* 0x98 */
00041 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, /* 0xa0 */
00042 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, /* 0xa8 */
00043 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xb0 */
00044 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, /* 0xb8 */
00045 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xc0 */
00046 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* 0xc8 */
00047 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xd0 */
00048 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* 0xd8 */
00049 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xe0 */
00050 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* 0xe8 */
00051 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xf0 */
00052 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, /* 0xf8 */
00053 };
00054
00055 static const USHORT usCp1251[] = {
00056 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
00057 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
00058 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
00059 0x00f3, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
00060 0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
00061 0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
00062 0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
00063 0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
00064 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
00065 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
00066 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
00067 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
00068 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
00069 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
00070 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
00071 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
00072 };
00073
/*
 * Unicode values for the byte values 0x80..0xff, indexed by (byte - 0x80).
 * ulTranslateCharacters selects this table for encoding_latin_1 and for
 * every encoding that has no table of its own (the switch default).
 * From 0xa0 on every entry equals its own byte value; some entries in
 * 0x80..0x9f hold 0x003f ('?').
 * NOTE(review): the name suggests Windows code page 1252, with 0x003f
 * standing in for unassigned bytes - confirm.
 */
00074 static const USHORT usCp1252[] = {
00075 0x20ac, 0x003f, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 */
00076 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x003f, 0x017d, 0x003f, /* 0x88 */
00077 0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 */
00078 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x003f, 0x017e, 0x0178, /* 0x98 */
00079 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xa0 */
00080 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* 0xa8 */
00081 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xb0 */
00082 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* 0xb8 */
00083 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xc0 */
00084 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* 0xc8 */
00085 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xd0 */
00086 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* 0xd8 */
00087 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xe0 */
00088 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* 0xe8 */
00089 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xf0 */
00090 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, /* 0xf8 */
00091 };
00092
/*
 * Unicode values for the byte values 0x80..0xff, indexed by (byte - 0x80).
 * ulTranslateCharacters selects this table when bUseMacCharSet is set.
 * The entry for byte 0xf0 holds 0x003f ('?').
 * NOTE(review): the name suggests the Macintosh Roman character set -
 * confirm.
 */
00093 static const USHORT usMacRoman[] = {
00094 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, /* 0x80 */
00095 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* 0x88 */
00096 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, /* 0x90 */
00097 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* 0x98 */
00098 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, /* 0xa0 */
00099 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* 0xa8 */
00100 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, /* 0xb0 */
00101 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x2126, 0x00e6, 0x00f8, /* 0xb8 */
00102 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, /* 0xc0 */
00103 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* 0xc8 */
00104 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, /* 0xd0 */
00105 0x00ff, 0x0178, 0x2044, 0x00a4, 0x2039, 0x203a, 0xfb01, 0xfb02, /* 0xd8 */
00106 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, /* 0xe0 */
00107 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* 0xe8 */
00108 0x003f, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, /* 0xf0 */
00109 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, /* 0xf8 */
00110 };
00111
/*
 * Replacement values for the characters 0xf020..0xf0ff, indexed by
 * (character - 0xf020); used by ulTranslateCharacters.
 * Entries holding 0x003f ('?') are reported through DBG_HEX_C when hit.
 * NOTE(review): the Greek letters and mathematical operators suggest the
 * layout of a symbol font - confirm against the font this was taken from.
 */
00112 static const USHORT usPrivateArea[] = {
00113 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220d, /* 0xf020 */
00114 0x0028, 0x0029, 0x2217, 0x002b, 0x002c, 0x2212, 0x002e, 0x002f, /* 0xf028 */
00115 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0xf030 */
00116 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x2019, 0x003e, 0x003f, /* 0xf038 */
00117 0x201d, 0x201c, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393, /* 0xf040 */
00118 0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f, /* 0xf048 */
00119 0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9, /* 0xf050 */
00120 0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0x005f, /* 0xf058 */
00121 0x003f, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03c6, 0x03b3, /* 0xf060 */
00122 0x03b7, 0x03b9, 0x03d5, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf, /* 0xf068 */
00123 0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9, /* 0xf070 */
00124 0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x223c, 0x003f, /* 0xf078 */
00125 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, /* 0xf080 */
00126 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, /* 0xf088 */
00127 0x003f, 0x003f, 0x003f, 0x2022, 0x003f, 0x003f, 0x003f, 0x003f, /* 0xf090 */
00128 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, /* 0xf098 */
00129 0x20ac, 0x03d2, 0x2032, 0x2264, 0x2044, 0x221e, 0x0192, 0x2663, /* 0xf0a0 */
00130 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193, /* 0xf0a8 */
00131 0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022, /* 0xf0b0 */
00132 0x00f7, 0x2260, 0x2261, 0x2248, 0x2026, 0x007c, 0x23af, 0x21b5, /* 0xf0b8 */
00133 0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229, /* 0xf0c0 */
00134 0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209, /* 0xf0c8 */
00135 0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5, /* 0xf0d0 */
00136 0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3, /* 0xf0d8 */
00137 0x22c4, 0x3008, 0x00ae, 0x00a9, 0x2122, 0x2211, 0x239b, 0x239c, /* 0xf0e0 */
00138 0x239d, 0x23a1, 0x23a2, 0x23a3, 0x23a7, 0x23a8, 0x23a9, 0x23aa, /* 0xf0e8 */
00139 0x003f, 0x3009, 0x222b, 0x2320, 0x23ae, 0x2321, 0x239e, 0x239f, /* 0xf0f0 */
00140 0x23a0, 0x23a4, 0x23a5, 0x23a6, 0x23ab, 0x23ac, 0x23ad, 0x003f, /* 0xf0f8 */
00141 };
00142
/* One record of the Unicode-to-local character mapping table */
00143 typedef struct char_table_tag {
00144 UCHAR ucLocal; /* character code in the local character set */
00145 USHORT usUnicode; /* Unicode value; the key used by iCompare */
00146 } char_table_type;
00147
/*
 * The mapping table filled by bReadCharacterMappingTable and searched by
 * pGetCharTableRecord, plus the number of records currently in use.
 */
00148 static char_table_type atCharTable[256]; /* sorted on usUnicode after loading */
00149 static size_t tNextPosFree = 0; /* index of the first unused record */
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159 static int
00160 iCompare(const void *pvRecord1, const void *pvRecord2)
00161 {
00162 USHORT usUnicode1, usUnicode2;
00163
00164 usUnicode1 = ((char_table_type *)pvRecord1)->usUnicode;
00165 usUnicode2 = ((char_table_type *)pvRecord2)->usUnicode;
00166
00167 if (usUnicode1 < usUnicode2) {
00168 return -1;
00169 }
00170 if (usUnicode1 > usUnicode2) {
00171 return 1;
00172 }
00173 return 0;
00174 }
00175
00176
00177
00178
00179
00180
00181 static const char_table_type *
00182 pGetCharTableRecord(USHORT usUnicode)
00183 {
00184 char_table_type tKey;
00185
00186 if (tNextPosFree == 0) {
00187 return NULL;
00188 }
00189 tKey.usUnicode = usUnicode;
00190 tKey.ucLocal = 0;
00191 return (char_table_type *)bsearch(&tKey,
00192 atCharTable,
00193 tNextPosFree, sizeof(atCharTable[0]),
00194 iCompare);
00195 }
00196
00197
00198
00199
00200 UCHAR
00201 ucGetBulletCharacter(conversion_type eConversionType, encoding_type eEncoding)
00202 {
00203 #if defined(__riscos)
00204 return 0x8f;
00205 #else
00206 const char_table_type *pRec;
00207
00208 fail(eEncoding == encoding_utf_8);
00209
00210 if (eEncoding == encoding_latin_1 &&
00211 (eConversionType == conversion_ps ||
00212 eConversionType == conversion_pdf)) {
00213
00214 return (UCHAR)143;
00215 }
00216 if (eConversionType != conversion_text &&
00217 eConversionType != conversion_fmt_text) {
00218 pRec = pGetCharTableRecord(UNICODE_BULLET);
00219 if (pRec != NULL) {
00220 return pRec->ucLocal;
00221 }
00222 pRec = pGetCharTableRecord(UNICODE_BULLET_OPERATOR);
00223 if (pRec != NULL) {
00224 return pRec->ucLocal;
00225 }
00226 pRec = pGetCharTableRecord(UNICODE_MIDDLE_DOT);
00227 if (pRec != NULL) {
00228 return pRec->ucLocal;
00229 }
00230 }
00231 return (UCHAR)'.';
00232 #endif
00233 }
00234
00235
00236
00237
00238 UCHAR
00239 ucGetNbspCharacter(void)
00240 {
00241 const char_table_type *pRec;
00242
00243 pRec = pGetCharTableRecord(0x00a0);
00244 if (pRec == NULL) {
00245 DBG_MSG("Non-breaking space record not found");
00246
00247 return (UCHAR)0xa0;
00248 }
00249 return pRec->ucLocal;
00250 }
00251
00252
00253
00254
00255
00256
00257
00258
00259 BOOL
00260 bReadCharacterMappingTable(FILE *pFile)
00261 {
00262 char *pcTmp;
00263 ULONG ulUnicode;
00264 UINT uiLocal;
00265 int iFields;
00266 char szLine[81];
00267
00268 if (pFile == NULL) {
00269 return FALSE;
00270 }
00271
00272
00273 (void)memset(atCharTable, 0, sizeof(atCharTable));
00274
00275
00276 while (fgets(szLine, (int)sizeof(szLine), pFile)) {
00277 if (szLine[0] == '#' ||
00278 szLine[0] == '\r' ||
00279 szLine[0] == '\n') {
00280
00281 continue;
00282 }
00283 iFields = sscanf(szLine, "%x %lx %*s", &uiLocal, &ulUnicode);
00284 if (iFields != 2) {
00285 pcTmp = strchr(szLine, '\r');
00286 if (pcTmp != NULL) {
00287 *pcTmp = '\0';
00288 }
00289 pcTmp = strchr(szLine, '\n');
00290 if (pcTmp != NULL) {
00291 *pcTmp = '\0';
00292 }
00293 werr(0, "Syntax error in: '%s'", szLine);
00294 continue;
00295 }
00296 if (uiLocal > 0xff || ulUnicode > 0xffff) {
00297 werr(0, "Syntax error in: '%02x %04lx'",
00298 uiLocal, ulUnicode);
00299 continue;
00300 }
00301
00302 if (uiLocal != ulUnicode || uiLocal >= 0x80) {
00303 atCharTable[tNextPosFree].ucLocal = (UCHAR)uiLocal;
00304 atCharTable[tNextPosFree].usUnicode = (USHORT)ulUnicode;
00305 tNextPosFree++;
00306 }
00307 if (tNextPosFree >= elementsof(atCharTable)) {
00308 werr(0, "Too many entries in the character mapping "
00309 "file. Ignoring the rest.");
00310 break;
00311 }
00312 }
00313
00314 if (tNextPosFree != 0) {
00315 DBG_HEX(atCharTable[0].usUnicode);
00316 DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
00317
00318 qsort(atCharTable,
00319 tNextPosFree, sizeof(atCharTable[0]),
00320 iCompare);
00321
00322 DBG_HEX(atCharTable[0].usUnicode);
00323 DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
00324 }
00325
00326 return TRUE;
00327 }
00328
00329
00330
00331
00332
00333
00334
00335
/*
 * ulTranslateCharacters - translate a character from a Word document to its
 * output representation
 *
 * usChar: the character as read from the document
 * ulFileOffset: passed to eGetNotetype to tell footnotes from endnotes;
 *               the value 0 also selects OUR_ELLIPSIS for an ellipsis
 * iWordVersion: 0 selects the usCp850 table; values below 8 cause
 *               0xa0..0xff to be remapped as well
 * eConversionType: the kind of output being produced
 * eEncoding: the output encoding
 * bUseMacCharSet: when set, the usMacRoman table is used
 *
 * The stages run in a fixed order and each one works on the result of the
 * previous one: character set table, private area table, Word control
 * characters, PostScript/PDF special codes, plain ASCII, UTF-8 pass
 * through, mapping table lookup, ASCII look-alikes.
 *
 * returns the translated character, IGNORE_CHARACTER when the character
 * must be dropped, or 0x3f ('?') when no translation is known
 */
00336 ULONG
00337 ulTranslateCharacters(USHORT usChar, ULONG ulFileOffset, int iWordVersion,
00338 conversion_type eConversionType, encoding_type eEncoding,
00339 BOOL bUseMacCharSet)
00340 {
00341 const char_table_type *pTmp;
00342 const USHORT *usCharSet;
00343
00344 usCharSet = NULL;
00345 if (bUseMacCharSet) {
00346 /* Macintosh character set requested by the caller */
00347 usCharSet = usMacRoman;
00348 } else if (iWordVersion == 0) {
00349 /* Word version 0 uses the usCp850 table */
00350 usCharSet = usCp850;
00351 } else {
00352 /* All other versions: the table depends on the encoding */
00353 switch (eEncoding) {
00354 case encoding_latin_2:
00355 usCharSet = usCp1250;
00356 break;
00357 case encoding_cyrillic:
00358 usCharSet = usCp1251;
00359 break;
00360 case encoding_latin_1:
00361 default:
00362 usCharSet = usCp1252;
00363 break;
00364 }
00365 }
00366 fail(usCharSet == NULL);
00367 if (usChar >= 0x80 && usChar <= 0x9f) {
00368 /* 0x80..0x9f is always remapped through the selected table */
00369 usChar = usCharSet[usChar - 0x80];
00370 } else if (iWordVersion < 8 && usChar >= 0xa0 && usChar <= 0xff) {
00371 /* Word versions below 8: 0xa0..0xff is remapped as well */
00372 usChar = usCharSet[usChar - 0x80];
00373 }
00374
00375 /* 0xf020..0xf0ff is replaced via usPrivateArea */
00376 if (usChar >= 0xf020 && usChar <= 0xf0ff) {
00377 DBG_HEX_C(usPrivateArea[usChar - 0xf020] == 0x003f, usChar);
00378 usChar = usPrivateArea[usChar - 0xf020];
00379 }
00380
00381 /* Word control characters: dropped, passed through or replaced */
00382 switch (usChar) {
00383 case IGNORE_CHARACTER:
00384 case FOOTNOTE_SEPARATOR:
00385 case FOOTNOTE_CONTINUATION:
00386 case ANNOTATION:
00387 case FRAME:
00388 case LINE_FEED:
00389 case WORD_SOFT_HYPHEN:
00390 case UNICODE_HYPHENATION_POINT:
00391 return IGNORE_CHARACTER;
00392 case PICTURE:
00393 case TABLE_SEPARATOR:
00394 case TAB:
00395 case HARD_RETURN:
00396 case PAGE_BREAK:
00397 case PAR_END:
00398 case COLUMN_FEED:
00399 return (ULONG)usChar;
00400 case FOOTNOTE_OR_ENDNOTE:
00401 NO_DBG_HEX(ulFileOffset);
00402 switch (eGetNotetype(ulFileOffset)) {
00403 case notetype_is_footnote:
00404 return FOOTNOTE_CHAR;
00405 case notetype_is_endnote:
00406 return ENDNOTE_CHAR;
00407 default:
00408 return UNKNOWN_NOTE_CHAR;
00409 }
00410 case WORD_UNBREAKABLE_JOIN:
00411 return (ULONG)OUR_UNBREAKABLE_JOIN;
00412 default:
00413 break;
00414 }
00415
00416 if (eEncoding != encoding_utf_8) {
00417 /* Shift 0xff01..0xff5e down by 0xfee0, giving 0x21..0x7e */
00418 if (usChar >= 0xff01 && usChar <= 0xff5e) {
00419 usChar -= 0xfee0;
00420 }
00421 }
00422
00423 if (eEncoding == encoding_latin_1 &&
00424 (eConversionType == conversion_ps ||
00425 eConversionType == conversion_pdf)) {
00426 /* PostScript/PDF output in Latin-1: fixed codes 140..159 */
00427 switch (usChar) {
00428 case UNICODE_ELLIPSIS:
00429 return 140;
00430 case UNICODE_TRADEMARK_SIGN:
00431 return 141;
00432 case UNICODE_PER_MILLE_SIGN:
00433 return 142;
00434 case UNICODE_BULLET:
00435 case UNICODE_BULLET_OPERATOR:
00436 case UNICODE_BLACK_CLUB_SUIT:
00437 return 143;
00438 case UNICODE_LEFT_SINGLE_QMARK:
00439 return 144;
00440 case UNICODE_RIGHT_SINGLE_QMARK:
00441 return 145;
00442 case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
00443 return 146;
00444 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
00445 return 147;
00446 case UNICODE_LEFT_DOUBLE_QMARK:
00447 return 148;
00448 case UNICODE_RIGHT_DOUBLE_QMARK:
00449 return 149;
00450 case UNICODE_DOUBLE_LOW_9_QMARK:
00451 return 150;
00452 case UNICODE_EN_DASH:
00453 return 151;
00454 case UNICODE_EM_DASH:
00455 return 152;
00456 case UNICODE_MINUS_SIGN:
00457 return 153;
00458 case UNICODE_CAPITAL_LIGATURE_OE:
00459 return 154;
00460 case UNICODE_SMALL_LIGATURE_OE:
00461 return 155;
00462 case UNICODE_DAGGER:
00463 return 156;
00464 case UNICODE_DOUBLE_DAGGER:
00465 return 157;
00466 case UNICODE_SMALL_LIGATURE_FI:
00467 return 158;
00468 case UNICODE_SMALL_LIGATURE_FL:
00469 return 159;
00470 default:
00471 break;
00472 }
00473 }
00474
00475 if (eConversionType == conversion_pdf) {
00476 if (eEncoding == encoding_latin_1) {
00477 switch (usChar) {
00478 case UNICODE_EURO_SIGN:
00479 return 128;
00480 default:
00481 break;
00482 }
00483 } else if (eEncoding == encoding_latin_2) {
00484 switch (usChar) {
00485 case UNICODE_CAPITAL_D_WITH_STROKE:
00486 case UNICODE_SMALL_D_WITH_STROKE:
00487 return 0x3f;
00488 default:
00489 break;
00490 }
00491 }
00492 }
00493
00494 if (usChar < 0x80) {
00495 /* 7-bit range: returned as is, except for the control characters */
00496 if (usChar < 0x20 || usChar == 0x7f) {
00497 /* Remaining control characters (and 0x7f) are dropped */
00498 DBG_HEX(usChar);
00499 DBG_FIXME();
00500 return IGNORE_CHARACTER;
00501 }
00502 return (ULONG)usChar;
00503 }
00504
00505 if (eEncoding == encoding_utf_8) {
00506 /* UTF-8 output: the Unicode value is returned unchanged */
00507 return (ULONG)usChar;
00508 }
00509
00510 /* Look for a local representation in the mapping table */
00511 pTmp = pGetCharTableRecord(usChar);
00512 if (pTmp != NULL) {
00513 DBG_HEX_C(usChar >= 0x7f && usChar <= 0x9f, usChar);
00514 return (ULONG)pTmp->ucLocal;
00515 }
00516
00517 /* No record in the mapping table: fall back to an ASCII look-alike */
00518 switch (usChar) {
00519 case UNICODE_SMALL_F_HOOK:
00520 return (ULONG)'f';
00521 case UNICODE_GREEK_CAPITAL_CHI:
00522 return (ULONG)'X';
00523 case UNICODE_GREEK_SMALL_UPSILON:
00524 return (ULONG)'v';
00525 case UNICODE_MODIFIER_CIRCUMFLEX:
00526 case UNICODE_UPWARDS_ARROW:
00527 return (ULONG)'^';
00528 case UNICODE_SMALL_TILDE:
00529 case UNICODE_TILDE_OPERATOR:
00530 return (ULONG)'~';
00531 case UNICODE_EN_QUAD:
00532 case UNICODE_EM_QUAD:
00533 case UNICODE_EN_SPACE:
00534 case UNICODE_EM_SPACE:
00535 case UNICODE_THREE_PER_EM_SPACE:
00536 case UNICODE_FOUR_PER_EM_SPACE:
00537 case UNICODE_SIX_PER_EM_SPACE:
00538 case UNICODE_FIGURE_SPACE:
00539 case UNICODE_PUNCTUATION_SPACE:
00540 case UNICODE_THIN_SPACE:
00541 case UNICODE_NARROW_NO_BREAK_SPACE:
00542 case UNICODE_LIGHT_SHADE:
00543 case UNICODE_MEDIUM_SHADE:
00544 case UNICODE_DARK_SHADE:
00545 return (ULONG)' ';
00546 case UNICODE_LEFT_DOUBLE_QMARK:
00547 case UNICODE_RIGHT_DOUBLE_QMARK:
00548 case UNICODE_DOUBLE_LOW_9_QMARK:
00549 case UNICODE_DOUBLE_HIGH_REV_9_QMARK:
00550 case UNICODE_DOUBLE_PRIME:
00551 return (ULONG)'"';
00552 case UNICODE_LEFT_SINGLE_QMARK:
00553 case UNICODE_RIGHT_SINGLE_QMARK:
00554 case UNICODE_SINGLE_LOW_9_QMARK:
00555 case UNICODE_SINGLE_HIGH_REV_9_QMARK:
00556 case UNICODE_PRIME:
00557 return (ULONG)'\'';
00558 case UNICODE_HYPHEN:
00559 case UNICODE_NON_BREAKING_HYPHEN:
00560 case UNICODE_FIGURE_DASH:
00561 case UNICODE_EN_DASH:
00562 case UNICODE_EM_DASH:
00563 case UNICODE_HORIZONTAL_BAR:
00564 case UNICODE_MINUS_SIGN:
00565 case UNICODE_BD_LIGHT_HORIZONTAL:
00566 case UNICODE_BD_DOUBLE_HORIZONTAL:
00567 return (ULONG)'-';
00568 case UNICODE_DOUBLE_VERTICAL_LINE:
00569 case UNICODE_BD_LIGHT_VERTICAL:
00570 case UNICODE_BD_DOUBLE_VERTICAL:
00571 return (ULONG)'|';
00572 case UNICODE_DOUBLE_LOW_LINE:
00573 return (ULONG)'_';
00574 case UNICODE_DAGGER:
00575 return (ULONG)'+';
00576 case UNICODE_DOUBLE_DAGGER:
00577 return (ULONG)'#';
00578 case UNICODE_BULLET:
00579 case UNICODE_BULLET_OPERATOR:
00580 case UNICODE_BLACK_CLUB_SUIT:
00581 return (ULONG)ucGetBulletCharacter(eConversionType, eEncoding);
00582 case UNICODE_ONE_DOT_LEADER:
00583 case UNICODE_TWO_DOT_LEADER:
00584 return (ULONG)'.';
00585 case UNICODE_ELLIPSIS:
00586 #if defined(__riscos)
00587 return (ULONG)OUR_ELLIPSIS;
00588 #else
00589 if (ulFileOffset == 0) { /* NOTE(review): offset 0 presumably marks text that does not come from the document - confirm with the callers */
00590 return (ULONG)OUR_ELLIPSIS;
00591 }
00592 return UNICODE_ELLIPSIS;
00593 #endif
00594 case UNICODE_DOUBLE_LEFT_ANGLE_QMARK:
00595 case UNICODE_TRIANGULAR_BULLET:
00596 case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
00597 case UNICODE_LEFTWARDS_ARROW:
00598 return (ULONG)'<';
00599 case UNICODE_DOUBLE_RIGHT_ANGLE_QMARK:
00600 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
00601 case UNICODE_RIGHTWARDS_ARROW:
00602 return (ULONG)'>';
00603 case UNICODE_UNDERTIE:
00604 return (ULONG)'-';
00605 case UNICODE_N_ARY_SUMMATION:
00606 return (ULONG)'S';
00607 case UNICODE_EURO_SIGN:
00608 return (ULONG)'E';
00609 case UNICODE_CIRCLE:
00610 case UNICODE_SQUARE:
00611 return (ULONG)'O';
00612 case UNICODE_DIAMOND:
00613 return (ULONG)OUR_DIAMOND;
00614 case UNICODE_NUMERO_SIGN:
00615 return (ULONG)'N';
00616 case UNICODE_KELVIN_SIGN:
00617 return (ULONG)'K';
00618 case UNICODE_DOWNWARDS_ARROW:
00619 return (ULONG)'v';
00620 case UNICODE_FRACTION_SLASH:
00621 case UNICODE_DIVISION_SLASH:
00622 return (ULONG)'/';
00623 case UNICODE_ASTERISK_OPERATOR:
00624 return (ULONG)'*';
00625 case UNICODE_RATIO:
00626 return (ULONG)':';
00627 case UNICODE_BD_LIGHT_DOWN_RIGHT:
00628 case UNICODE_BD_LIGHT_DOWN_AND_LEFT:
00629 case UNICODE_BD_LIGHT_UP_AND_RIGHT:
00630 case UNICODE_BD_LIGHT_UP_AND_LEFT:
00631 case UNICODE_BD_LIGHT_VERTICAL_AND_RIGHT:
00632 case UNICODE_BD_LIGHT_VERTICAL_AND_LEFT:
00633 case UNICODE_BD_LIGHT_DOWN_AND_HORIZONTAL:
00634 case UNICODE_BD_LIGHT_UP_AND_HORIZONTAL:
00635 case UNICODE_BD_LIGHT_VERTICAL_AND_HORIZONTAL:
00636 case UNICODE_BD_DOUBLE_DOWN_AND_RIGHT:
00637 case UNICODE_BD_DOUBLE_DOWN_AND_LEFT:
00638 case UNICODE_BD_DOUBLE_UP_AND_RIGHT:
00639 case UNICODE_BD_DOUBLE_UP_AND_LEFT:
00640 case UNICODE_BD_DOUBLE_VERTICAL_AND_RIGHT:
00641 case UNICODE_BD_DOUBLE_VERTICAL_AND_LEFT:
00642 case UNICODE_BD_DOUBLE_DOWN_AND_HORIZONTAL:
00643 case UNICODE_BD_DOUBLE_UP_AND_HORIZONTAL:
00644 case UNICODE_BD_DOUBLE_VERTICAL_AND_HORIZONTAL:
00645 case UNICODE_BLACK_SQUARE:
00646 return (ULONG)'+';
00647 case UNICODE_HAIR_SPACE:
00648 case UNICODE_ZERO_WIDTH_SPACE:
00649 case UNICODE_ZERO_WIDTH_NON_JOINER:
00650 case UNICODE_ZERO_WIDTH_JOINER:
00651 case UNICODE_LEFT_TO_RIGHT_MARK:
00652 case UNICODE_RIGHT_TO_LEFT_MARK:
00653 case UNICODE_LEFT_TO_RIGHT_EMBEDDING:
00654 case UNICODE_RIGHT_TO_LEFT_EMBEDDING:
00655 case UNICODE_POP_DIRECTIONAL_FORMATTING:
00656 case UNICODE_LEFT_TO_RIGHT_OVERRIDE:
00657 case UNICODE_RIGHT_TO_LEFT_OVERRIDE:
00658 case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE:
00659 return IGNORE_CHARACTER;
00660 default:
00661 break;
00662 }
00663
00664 if (usChar == UNICODE_TRADEMARK_SIGN) {
00665 /* A trademark sign without a mapping table record is dropped */
00666 /* instead of being replaced by a question mark */
00667
00668
00669 return IGNORE_CHARACTER;
00670 }
00671
00672 if (usChar >= 0xa0 && usChar <= 0xff) {
00673 /* 0xa0..0xff without a mapping table record is returned unchanged */
00674 return (ULONG)usChar;
00675 }
00676
00677 DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, ulFileOffset);
00678 DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, usChar);
00679 DBG_MSG_C(usChar >= 0xe000 && usChar < 0xf900, "Private Use Area");
00680
00681 /* No translation known: return a question mark (0x3f) */
00682 return 0x3f;
00683 }
00684
00685
00686
00687
00688
00689
00690
00691
00692
00693
00694
00695 ULONG
00696 ulToUpper(ULONG ulChar)
00697 {
00698 if (ulChar < 0x80) {
00699
00700 return (ULONG)toupper((int)ulChar);
00701 }
00702 if (ulChar >= 0xe0 && ulChar <= 0xfe && ulChar != 0xf7) {
00703
00704
00705
00706
00707
00708 return ulChar & ~0x20;
00709 }
00710 #if defined(__STDC_ISO_10646__)
00711
00712
00713
00714
00715 if (ulChar > 0xff) {
00716 return (ULONG)towupper((wint_t)ulChar);
00717 }
00718 #endif
00719 return ulChar;
00720 }