diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-05-30 19:04:51 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-05-30 19:04:51 +0000 |
commit | 850f165f4ac90a6e6687c392ddfdd0c6a05b3fe5 (patch) | |
tree | 7cac2f01468639c4ab63fe523c17d7638e8cd2ac /src | |
parent | 01af2a5e73f53ebf8a092e4bda77cd1326c1da11 (diff) |
Added hts_unescape_entities(), a rewrite of the HTML entities decoder.
Fixed HTML entities decoding which was done before charset decoding.
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile.am | 10 | ||||
-rw-r--r-- | src/Makefile.in | 22 | ||||
-rw-r--r-- | src/htscoremain.c | 17 | ||||
-rw-r--r-- | src/htsencoding.c | 176 | ||||
-rw-r--r-- | src/htsencoding.h | 55 | ||||
-rw-r--r-- | src/htsentities.h | 1535 | ||||
-rwxr-xr-x | src/htsentities.sh | 75 | ||||
-rw-r--r-- | src/htslib.c | 256 | ||||
-rw-r--r-- | src/htsparse.c | 8 |
9 files changed, 1885 insertions, 269 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index 0901f6d..59a6778 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -25,8 +25,8 @@ INCLUDES = \ bin_PROGRAMS = proxytrack httrack htsserver -httrack_LDADD = $(THREADS_LIBS) $(OPENSSL_LIBS) libhttrack.la -htsserver_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) $(OPENSSL_LIBS) libhttrack.la +httrack_LDADD = $(THREADS_LIBS) libhttrack.la +htsserver_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) libhttrack.la proxytrack_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) proxytrack_CFLAGS = $(AM_CFLAGS) -DNO_MALLOCT @@ -47,7 +47,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ htsname.c htsrobots.c htstools.c htswizard.c \ htsalias.c htsthread.c htsindex.c htsbauth.c \ htsmd5.c htszlib.c htswrap.c \ - htsmodules.c htscharset.c punycode.c \ + htsmodules.c htscharset.c punycode.c htsencoding.c \ md5.c \ htsmms.c \ minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \ @@ -60,7 +60,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ htsmodules.h htsname.h htsnet.h \ htsopt.h htsrobots.h htsthread.h \ htstools.h htswizard.h htswrap.h htszlib.h \ - htsstrings.h httrack-library.h htscharset.h punycode.h \ + htsstrings.h httrack-library.h htscharset.h punycode.h htsencoding.h \ md5.h \ htsmms.h \ minizip/crypt.h minizip/ioapi.h minizip/mztools.h minizip/unzip.h minizip/zip.h \ @@ -69,7 +69,7 @@ libhttrack_la_LIBADD = $(THREADS_LIBS) $(ZLIB_LIBS) $(OPENSSL_LIBS) $(DL_LIBS) $ libhttrack_la_LDFLAGS = -version-info $(VERSION_INFO) libhtsjava_la_SOURCES = htsjava.c htsjava.h -libhtsjava_la_LIBADD = $(THREADS_LIBS) $(OPENSSL_LIBS) $(DL_LIBS) libhttrack.la +libhtsjava_la_LIBADD = $(THREADS_LIBS) $(DL_LIBS) libhttrack.la libhtsjava_la_LDFLAGS = -version-info $(VERSION_INFO) EXTRA_DIST = httrack.h webhttrack \ diff --git a/src/Makefile.in b/src/Makefile.in index 30a1e93..9a943c1 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -67,7 +67,7 @@ libLTLIBRARIES_INSTALL = $(INSTALL) LTLIBRARIES = 
$(lib_LTLIBRARIES) am__DEPENDENCIES_1 = libhtsjava_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) libhttrack.la + $(am__DEPENDENCIES_1) libhttrack.la am_libhtsjava_la_OBJECTS = htsjava.lo libhtsjava_la_OBJECTS = $(am_libhtsjava_la_OBJECTS) libhttrack_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ @@ -78,19 +78,18 @@ am_libhttrack_la_OBJECTS = htscore.lo htsparse.lo htsback.lo \ htsinthash.lo htshelp.lo htslib.lo htscoremain.lo htsname.lo \ htsrobots.lo htstools.lo htswizard.lo htsalias.lo htsthread.lo \ htsindex.lo htsbauth.lo htsmd5.lo htszlib.lo htswrap.lo \ - htsmodules.lo htscharset.lo punycode.lo md5.lo htsmms.lo \ - ioapi.lo mztools.lo unzip.lo zip.lo error.lo mms.lo + htsmodules.lo htscharset.lo punycode.lo htsencoding.lo md5.lo \ + htsmms.lo ioapi.lo mztools.lo unzip.lo zip.lo error.lo mms.lo libhttrack_la_OBJECTS = $(am_libhttrack_la_OBJECTS) binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) PROGRAMS = $(bin_PROGRAMS) am_htsserver_OBJECTS = htsserver.$(OBJEXT) htsweb.$(OBJEXT) htsserver_OBJECTS = $(am_htsserver_OBJECTS) htsserver_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) libhttrack.la + libhttrack.la httrack_SOURCES = httrack.c httrack_OBJECTS = httrack.$(OBJEXT) -httrack_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - libhttrack.la +httrack_DEPENDENCIES = $(am__DEPENDENCIES_1) libhttrack.la am_proxytrack_OBJECTS = proxytrack-main.$(OBJEXT) \ proxytrack-proxytrack.$(OBJEXT) proxytrack-store.$(OBJEXT) \ proxytrack-htsinthash.$(OBJEXT) proxytrack-htsmd5.$(OBJEXT) \ @@ -270,8 +269,8 @@ INCLUDES = \ -DDATADIR=\""$(datadir)"\" \ -DLIBDIR=\""$(libdir)"\" -httrack_LDADD = $(THREADS_LIBS) $(OPENSSL_LIBS) libhttrack.la -htsserver_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) $(OPENSSL_LIBS) libhttrack.la +httrack_LDADD = $(THREADS_LIBS) libhttrack.la +htsserver_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) libhttrack.la proxytrack_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) proxytrack_CFLAGS = 
$(AM_CFLAGS) -DNO_MALLOCT lib_LTLIBRARIES = libhttrack.la libhtsjava.la @@ -289,7 +288,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ htsname.c htsrobots.c htstools.c htswizard.c \ htsalias.c htsthread.c htsindex.c htsbauth.c \ htsmd5.c htszlib.c htswrap.c \ - htsmodules.c htscharset.c punycode.c \ + htsmodules.c htscharset.c punycode.c htsencoding.c \ md5.c \ htsmms.c \ minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \ @@ -302,7 +301,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ htsmodules.h htsname.h htsnet.h \ htsopt.h htsrobots.h htsthread.h \ htstools.h htswizard.h htswrap.h htszlib.h \ - htsstrings.h httrack-library.h htscharset.h punycode.h \ + htsstrings.h httrack-library.h htscharset.h punycode.h htsencoding.h \ md5.h \ htsmms.h \ minizip/crypt.h minizip/ioapi.h minizip/mztools.h minizip/unzip.h minizip/zip.h \ @@ -311,7 +310,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ libhttrack_la_LIBADD = $(THREADS_LIBS) $(ZLIB_LIBS) $(OPENSSL_LIBS) $(DL_LIBS) $(SOCKET_LIBS) $(ICONV_LIBS) libhttrack_la_LDFLAGS = -version-info $(VERSION_INFO) libhtsjava_la_SOURCES = htsjava.c htsjava.h -libhtsjava_la_LIBADD = $(THREADS_LIBS) $(OPENSSL_LIBS) $(DL_LIBS) libhttrack.la +libhtsjava_la_LIBADD = $(THREADS_LIBS) $(DL_LIBS) libhttrack.la libhtsjava_la_LDFLAGS = -version-info $(VERSION_INFO) EXTRA_DIST = httrack.h webhttrack \ minizip/ChangeLogUnzip \ @@ -468,6 +467,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htscharset.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htscore.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htscoremain.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htsencoding.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htsfilters.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htsftp.Plo@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/htshash.Plo@am__quote@ diff --git a/src/htscoremain.c b/src/htscoremain.c index b918ed0..534c469 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -2342,6 +2342,23 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) { htsmain_free(); return 0; break; + case '6': // entities: httrack -#6 + if (++na < argc) { + char *const s = strdup(argv[na]); + if (s != NULL && hts_unescape_entities(s, s, strlen(s)) == 0) { + printf("%s\n", s); + free(s); + } else { + fprintf(stderr, "invalid string '%s'\n", argv[na]); + } + na += 1; + } else { + fprintf(stderr, + "Option #6 needs to be followed by a string"); + } + htsmain_free(); + return 0; + break; case '!': if (na + 1 >= argc) { HTS_PANIC_PRINTF diff --git a/src/htsencoding.c b/src/htsencoding.c new file mode 100644 index 0000000..46c57a4 --- /dev/null +++ b/src/htsencoding.c @@ -0,0 +1,176 @@ +/* ------------------------------------------------------------ */ +/* +HTTrack Website Copier, Offline Browser for Windows and Unix +Copyright (C) Xavier Roche and other contributors + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 3 +of the License, or any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +Important notes: + +- We hereby ask people using this source NOT to use it in purpose of grabbing +emails addresses, or collecting any other private information on persons. +This would disgrace our work, and spoil the many hours we spent on it. 
+
+Please visit our Website: http://www.httrack.com
+*/
+
+/* ------------------------------------------------------------ */
+/* File: Encoding conversion functions */
+/* Author: Xavier Roche */
+/* ------------------------------------------------------------ */
+
+#include "htscharset.h"
+#include "htsencoding.h"
+
+/* static int decode_entity(const unsigned int hash, const size_t len);
+*/
+#include "htsentities.h"
+
+/* hexadecimal conversion: value of the hex digit 'c', or -1 if not one */
+static int get_hex_value(char c) {
+  if (c >= '0' && c <= '9')
+    return c - '0';
+  else if (c >= 'a' && c <= 'f')
+    return (c - 'a' + 10);
+  else if (c >= 'A' && c <= 'F')
+    return (c - 'A' + 10);
+  else
+    return -1;
+}
+
+/* Numerical Recipes,
+   see <http://en.wikipedia.org/wiki/Linear_congruential_generator> */
+#define HASH_PRIME ( 1664525 )
+#define HASH_CONST ( 1013904223 )
+#define HASH_ADD(HASH, C) do { \
+    (HASH) *= HASH_PRIME; \
+    (HASH) += HASH_CONST; \
+    (HASH) += (C); \
+  } while(0)
+
+/* Largest Unicode code point: bigger numeric entities are not decoded */
+#define HTS_MAX_CODEPOINT ( 0x10FFFF )
+
+/* Unescape the HTML entities ("&name;", "&#NNN;", "&#xHHH;") of 'src' into
+   'dest', each recognized entity being replaced by its UTF-8 encoding.
+   'dest' may be 'src' itself, for in-place decoding.
+   At most 'max' bytes are written, followed by a terminating NUL.
+   Entities that cannot be decoded are copied unchanged.
+   Returns 0 upon success, and -1 if 'dest' is too small. */
+int hts_unescape_entities(const char *src, char *dest, const size_t max) {
+  size_t i, j, ampStart, ampStartDest;
+  int uc;
+  int hex;
+  unsigned int hash;
+
+  for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0,
+      uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) {
+    /* start of entity */
+    if (src[i] == '&') {
+      ampStart = i;
+      ampStartDest = j;
+      hash = 0;
+      uc = -1;
+    }
+    /* inside a potential entity */
+    else if (ampStart != (size_t) -1) {
+      /* &#..; entity */
+      if (ampStart + 1 == i && src[ampStart + 1] == '#') {
+        uc = 0;
+        hex = 0;
+      }
+      /* &#x..; entity (the 'x' marker is case-insensitive) */
+      else if (ampStart + 2 == i && src[ampStart + 1] == '#'
+               && (src[i] == 'x' || src[i] == 'X')) {
+        hex = 1;
+      }
+      /* end of entity */
+      else if (src[i] == ';') {
+        size_t len;
+
+        /* decode entity */
+        if (uc == -1) {
+          /* &foo; */
+          uc = decode_entity(hash, /*&src[ampStart + 1],*/
+                             i - ampStart - 1);
+          /* FIXME: TEMPORARY HACK FROM PREVIOUS VERSION TO BE INVESTIGATED */
+          if (uc == 160) {
+            uc = 32;
+          }
+        }
+
+        /* end */
+        ampStart = (size_t) -1;
+
+        /* success ? */
+        if (uc > 0) {
+          /* write at position */
+          len = hts_writeUTF8(uc, &dest[ampStartDest], max - ampStartDest);
+          if (len > 0) {
+            /* new dest position; do not copy ; */
+            j = ampStartDest + len;
+            continue;
+          }
+          /* not encodable: keep the entity text as is, ; included */
+        }
+      }
+      /* numerical entity */
+      else if (uc != -1) {
+        const int base = hex ? 16 : 10;
+        const int h = get_hex_value(src[i]);
+
+        /* take the digit unless it is invalid for this base, or the value
+           would leave the Unicode range (this also prevents int overflow) */
+        if (h != -1 && h < base && uc <= (HTS_MAX_CODEPOINT - h) / base) {
+          uc = uc * base + h;
+        } else {
+          /* abandon */
+          ampStart = (size_t) -1;
+        }
+      }
+      /* alphanumerical entity */
+      else {
+        /* alphanum and not too far ('&thetasym;' is the longest) */
+        if (i <= ampStart + 10 &&
+            (
+              (src[i] >= '0' && src[i] <= '9')
+              || (src[i] >= 'A' && src[i] <= 'Z')
+              || (src[i] >= 'a' && src[i] <= 'z')
+            )
+           ) {
+          /* compute hash */
+          HASH_ADD(hash, (unsigned char) src[i]);
+        } else {
+          /* abandon */
+          ampStart = (size_t) -1;
+        }
+      }
+    }
+
+    /* copy */
+    if (j + 1 > max) {
+      /* overflow */
+      return -1;
+    }
+    /* in place with nothing shifted yet: the byte is already there */
+    if (src != dest || i != j) {
+      dest[j] = src[i];
+    }
+    j++;
+  }
+  dest[j] = '\0';
+
+  return 0;
+}
diff --git a/src/htsencoding.h b/src/htsencoding.h
new file mode 100644
index 0000000..4dfd367
--- /dev/null
+++ b/src/htsencoding.h
@@ -0,0 +1,55 @@
+/* ------------------------------------------------------------ */
+/*
+HTTrack Website Copier, Offline Browser for Windows and Unix
+Copyright (C) Xavier Roche and other contributors
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 3
+of the License, or any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+Important notes:
+
+- We hereby ask people using this source NOT to use it in purpose of grabbing
+emails addresses, or collecting any other private information on persons.
+This would disgrace our work, and spoil the many hours we spent on it.
+
+Please visit our Website: http://www.httrack.com
+*/
+
+/* ------------------------------------------------------------ */
+/* File: Encoding conversion functions */
+/* Author: Xavier Roche */
+/* ------------------------------------------------------------ */
+
+#ifndef HTS_ENCODING_DEFH
+#define HTS_ENCODING_DEFH
+
+/** Standard includes. **/
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+/**
+ * Unescape HTML entities (as per HTML 4.0 Specification)
+ * and replace them in-place by their UTF-8 equivalents.
+ * Note: source and destination may be the same, and the destination only
+ * needs to hold as much space as the source.
+ * Returns 0 upon success, and -1 if the destination is too small.
+ **/
+extern int hts_unescape_entities(const char *src,
+                                 char *dest, const size_t max);
+
+#endif
+
diff --git a/src/htsentities.h b/src/htsentities.h
new file mode 100644
index 0000000..bbb3a06
--- /dev/null
+++ b/src/htsentities.h
@@ -0,0 +1,1535 @@
+/*
+ -- htsentities.h --
+ FILE GENERATED BY ./htsentities.sh, DO NOT MODIFY
+
+ We compute the LCG hash
+ (see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
+ for each entity. We should in theory check using strncmp() that we
+ actually have the correct entity, but this is actually statistically
+ not needed.
+ + We may want to do better, but we expect the hash function to be uniform, and + let the compiler be smart enough to optimize the switch (for example by + checking in log2() intervals) + + This code has been generated using the evil ./htsentities.sh script. +*/ + +static int decode_entity(const unsigned int hash, const size_t len) { + switch(hash) { + /* nbsp 160 no-break space = non-breaking space, */ + case 3948425267: + if (len == 4 /* && strncmp(ent, "nbsp") == 0 */) { + return 160; + } + break; + /* iexcl 161 inverted exclamation mark, U+00A1 ISOnum */ + case 1499591408: + if (len == 5 /* && strncmp(ent, "iexcl") == 0 */) { + return 161; + } + break; + /* cent 162 cent sign, U+00A2 ISOnum */ + case 2824786826: + if (len == 4 /* && strncmp(ent, "cent") == 0 */) { + return 162; + } + break; + /* pound 163 pound sign, U+00A3 ISOnum */ + case 805305925: + if (len == 5 /* && strncmp(ent, "pound") == 0 */) { + return 163; + } + break; + /* curren 164 currency sign, U+00A4 ISOnum */ + case 1584829677: + if (len == 6 /* && strncmp(ent, "curren") == 0 */) { + return 164; + } + break; + /* yen 165 yen sign = yuan sign, U+00A5 ISOnum */ + case 3581281881: + if (len == 3 /* && strncmp(ent, "yen") == 0 */) { + return 165; + } + break; + /* brvbar 166 broken bar = broken vertical bar, */ + case 3768851825: + if (len == 6 /* && strncmp(ent, "brvbar") == 0 */) { + return 166; + } + break; + /* sect 167 section sign, U+00A7 ISOnum */ + case 2614630987: + if (len == 4 /* && strncmp(ent, "sect") == 0 */) { + return 167; + } + break; + /* uml 168 diaeresis = spacing diaeresis, */ + case 2036319259: + if (len == 3 /* && strncmp(ent, "uml") == 0 */) { + return 168; + } + break; + /* copy 169 copyright sign, U+00A9 ISOnum */ + case 2428845635: + if (len == 4 /* && strncmp(ent, "copy") == 0 */) { + return 169; + } + break; + /* ordf 170 feminine ordinal indicator, U+00AA ISOnum */ + case 212470411: + if (len == 4 /* && strncmp(ent, "ordf") == 0 */) { + return 170; + } + break; + /* 
laquo 171 left-pointing double angle quotation mark */ + case 401220509: + if (len == 5 /* && strncmp(ent, "laquo") == 0 */) { + return 171; + } + break; + /* not 172 not sign, U+00AC ISOnum */ + case 3607627678: + if (len == 3 /* && strncmp(ent, "not") == 0 */) { + return 172; + } + break; + /* shy 173 soft hyphen = discretionary hyphen, */ + case 1248857237: + if (len == 3 /* && strncmp(ent, "shy") == 0 */) { + return 173; + } + break; + /* reg 174 registered sign = registered trade mark sign, */ + case 854293939: + if (len == 3 /* && strncmp(ent, "reg") == 0 */) { + return 174; + } + break; + /* macr 175 macron = spacing macron = overline */ + case 591423527: + if (len == 4 /* && strncmp(ent, "macr") == 0 */) { + return 175; + } + break; + /* deg 176 degree sign, U+00B0 ISOnum */ + case 3990252661: + if (len == 3 /* && strncmp(ent, "deg") == 0 */) { + return 176; + } + break; + /* plusmn 177 plus-minus sign = plus-or-minus sign, */ + case 3641444957: + if (len == 6 /* && strncmp(ent, "plusmn") == 0 */) { + return 177; + } + break; + /* sup2 178 superscript two = superscript digit two */ + case 279450434: + if (len == 4 /* && strncmp(ent, "sup2") == 0 */) { + return 178; + } + break; + /* sup3 179 superscript three = superscript digit three */ + case 279450435: + if (len == 4 /* && strncmp(ent, "sup3") == 0 */) { + return 179; + } + break; + /* acute 180 acute accent = spacing acute, */ + case 1795641881: + if (len == 5 /* && strncmp(ent, "acute") == 0 */) { + return 180; + } + break; + /* micro 181 micro sign, U+00B5 ISOnum */ + case 1447763057: + if (len == 5 /* && strncmp(ent, "micro") == 0 */) { + return 181; + } + break; + /* para 182 pilcrow sign = paragraph sign, */ + case 848855704: + if (len == 4 /* && strncmp(ent, "para") == 0 */) { + return 182; + } + break; + /* middot 183 middle dot = Georgian comma */ + case 3167839463: + if (len == 6 /* && strncmp(ent, "middot") == 0 */) { + return 183; + } + break; + /* cedil 184 cedilla = spacing cedilla, U+00B8 
ISOdia */ + case 1354214564: + if (len == 5 /* && strncmp(ent, "cedil") == 0 */) { + return 184; + } + break; + /* sup1 185 superscript one = superscript digit one, */ + case 279450433: + if (len == 4 /* && strncmp(ent, "sup1") == 0 */) { + return 185; + } + break; + /* ordm 186 masculine ordinal indicator, */ + case 212470418: + if (len == 4 /* && strncmp(ent, "ordm") == 0 */) { + return 186; + } + break; + /* raquo 187 right-pointing double angle quotation mark */ + case 1355124995: + if (len == 5 /* && strncmp(ent, "raquo") == 0 */) { + return 187; + } + break; + /* frac14 188 vulgar fraction one quarter */ + case 1016175271: + if (len == 6 /* && strncmp(ent, "frac14") == 0 */) { + return 188; + } + break; + /* frac12 189 vulgar fraction one half */ + case 1016175269: + if (len == 6 /* && strncmp(ent, "frac12") == 0 */) { + return 189; + } + break; + /* frac34 190 vulgar fraction three quarters */ + case 1019504321: + if (len == 6 /* && strncmp(ent, "frac34") == 0 */) { + return 190; + } + break; + /* iquest 191 inverted question mark */ + case 430057661: + if (len == 6 /* && strncmp(ent, "iquest") == 0 */) { + return 191; + } + break; + /* Agrave 192 latin capital letter A with grave */ + case 2815520320: + if (len == 6 /* && strncmp(ent, "Agrave") == 0 */) { + return 192; + } + break; + /* Aacute 193 latin capital letter A with acute, */ + case 4192391993: + if (len == 6 /* && strncmp(ent, "Aacute") == 0 */) { + return 193; + } + break; + /* Acirc 194 latin capital letter A with circumflex, */ + case 619920369: + if (len == 5 /* && strncmp(ent, "Acirc") == 0 */) { + return 194; + } + break; + /* Atilde 195 latin capital letter A with tilde, */ + case 4145258425: + if (len == 6 /* && strncmp(ent, "Atilde") == 0 */) { + return 195; + } + break; + /* Auml 196 latin capital letter A with diaeresis, */ + case 3558330427: + if (len == 4 /* && strncmp(ent, "Auml") == 0 */) { + return 196; + } + break; + /* Aring 197 latin capital letter A with ring above */ + case 
1775583868: + if (len == 5 /* && strncmp(ent, "Aring") == 0 */) { + return 197; + } + break; + /* AElig 198 latin capital letter AE */ + case 3743972869: + if (len == 5 /* && strncmp(ent, "AElig") == 0 */) { + return 198; + } + break; + /* Ccedil 199 latin capital letter C with cedilla, */ + case 885931646: + if (len == 6 /* && strncmp(ent, "Ccedil") == 0 */) { + return 199; + } + break; + /* Egrave 200 latin capital letter E with grave, */ + case 1380421556: + if (len == 6 /* && strncmp(ent, "Egrave") == 0 */) { + return 200; + } + break; + /* Eacute 201 latin capital letter E with acute, */ + case 2757293229: + if (len == 6 /* && strncmp(ent, "Eacute") == 0 */) { + return 201; + } + break; + /* Ecirc 202 latin capital letter E with circumflex, */ + case 1255856693: + if (len == 5 /* && strncmp(ent, "Ecirc") == 0 */) { + return 202; + } + break; + /* Euml 203 latin capital letter E with diaeresis, */ + case 2436627087: + if (len == 4 /* && strncmp(ent, "Euml") == 0 */) { + return 203; + } + break; + /* Igrave 204 latin capital letter I with grave, */ + case 4240290088: + if (len == 6 /* && strncmp(ent, "Igrave") == 0 */) { + return 204; + } + break; + /* Iacute 205 latin capital letter I with acute, */ + case 1322194465: + if (len == 6 /* && strncmp(ent, "Iacute") == 0 */) { + return 205; + } + break; + /* Icirc 206 latin capital letter I with circumflex, */ + case 1891793017: + if (len == 5 /* && strncmp(ent, "Icirc") == 0 */) { + return 206; + } + break; + /* Iuml 207 latin capital letter I with diaeresis, */ + case 1314923747: + if (len == 4 /* && strncmp(ent, "Iuml") == 0 */) { + return 207; + } + break; + /* ETH 208 latin capital letter ETH, U+00D0 ISOlat1 */ + case 475229442: + if (len == 3 /* && strncmp(ent, "ETH") == 0 */) { + return 208; + } + break; + /* Ntilde 209 latin capital letter N with tilde, */ + case 2702412914: + if (len == 6 /* && strncmp(ent, "Ntilde") == 0 */) { + return 209; + } + break; + /* Ograve 210 latin capital letter O with grave, */ 
+ case 4235125590: + if (len == 6 /* && strncmp(ent, "Ograve") == 0 */) { + return 210; + } + break; + /* Oacute 211 latin capital letter O with acute, */ + case 1317029967: + if (len == 6 /* && strncmp(ent, "Oacute") == 0 */) { + return 211; + } + break; + /* Ocirc 212 latin capital letter O with circumflex, */ + case 2845697503: + if (len == 5 /* && strncmp(ent, "Ocirc") == 0 */) { + return 212; + } + break; + /* Otilde 213 latin capital letter O with tilde, */ + case 1269896399: + if (len == 6 /* && strncmp(ent, "Otilde") == 0 */) { + return 213; + } + break; + /* Ouml 214 latin capital letter O with diaeresis, */ + case 1779852385: + if (len == 4 /* && strncmp(ent, "Ouml") == 0 */) { + return 214; + } + break; + /* times 215 multiplication sign, U+00D7 ISOnum */ + case 2139742557: + if (len == 5 /* && strncmp(ent, "times") == 0 */) { + return 215; + } + break; + /* Oslash 216 latin capital letter O with stroke */ + case 1378045056: + if (len == 6 /* && strncmp(ent, "Oslash") == 0 */) { + return 216; + } + break; + /* Ugrave 217 latin capital letter U with grave, */ + case 4229961092: + if (len == 6 /* && strncmp(ent, "Ugrave") == 0 */) { + return 217; + } + break; + /* Uacute 218 latin capital letter U with acute, */ + case 1311865469: + if (len == 6 /* && strncmp(ent, "Uacute") == 0 */) { + return 218; + } + break; + /* Ucirc 219 latin capital letter U with circumflex, */ + case 3799601989: + if (len == 5 /* && strncmp(ent, "Ucirc") == 0 */) { + return 219; + } + break; + /* Uuml 220 latin capital letter U with diaeresis, */ + case 2244781023: + if (len == 4 /* && strncmp(ent, "Uuml") == 0 */) { + return 220; + } + break; + /* Yacute 221 latin capital letter Y with acute, */ + case 4171734001: + if (len == 6 /* && strncmp(ent, "Yacute") == 0 */) { + return 221; + } + break; + /* THORN 222 latin capital letter THORN, */ + case 4251263774: + if (len == 5 /* && strncmp(ent, "THORN") == 0 */) { + return 222; + } + break; + /* szlig 223 latin small letter sharp s = 
ess-zed, */ + case 51833136: + if (len == 5 /* && strncmp(ent, "szlig") == 0 */) { + return 223; + } + break; + /* agrave 224 latin small letter a with grave */ + case 4219632096: + if (len == 6 /* && strncmp(ent, "agrave") == 0 */) { + return 224; + } + break; + /* aacute 225 latin small letter a with acute, */ + case 1301536473: + if (len == 6 /* && strncmp(ent, "aacute") == 0 */) { + return 225; + } + break; + /* acirc 226 latin small letter a with circumflex, */ + case 1412443665: + if (len == 5 /* && strncmp(ent, "acirc") == 0 */) { + return 226; + } + break; + /* atilde 227 latin small letter a with tilde, */ + case 1254402905: + if (len == 6 /* && strncmp(ent, "atilde") == 0 */) { + return 227; + } + break; + /* auml 228 latin small letter a with diaeresis, */ + case 3174638299: + if (len == 4 /* && strncmp(ent, "auml") == 0 */) { + return 228; + } + break; + /* aring 229 latin small letter a with ring above */ + case 2568107164: + if (len == 5 /* && strncmp(ent, "aring") == 0 */) { + return 229; + } + break; + /* aelig 230 latin small letter ae */ + case 4152804037: + if (len == 5 /* && strncmp(ent, "aelig") == 0 */) { + return 230; + } + break; + /* ccedil 231 latin small letter c with cedilla, */ + case 2290043422: + if (len == 6 /* && strncmp(ent, "ccedil") == 0 */) { + return 231; + } + break; + /* egrave 232 latin small letter e with grave, */ + case 2784533332: + if (len == 6 /* && strncmp(ent, "egrave") == 0 */) { + return 232; + } + break; + /* eacute 233 latin small letter e with acute, */ + case 4161405005: + if (len == 6 /* && strncmp(ent, "eacute") == 0 */) { + return 233; + } + break; + /* ecirc 234 latin small letter e with circumflex, */ + case 2048379989: + if (len == 5 /* && strncmp(ent, "ecirc") == 0 */) { + return 234; + } + break; + /* euml 235 latin small letter e with diaeresis, */ + case 2052934959: + if (len == 4 /* && strncmp(ent, "euml") == 0 */) { + return 235; + } + break; + /* igrave 236 latin small letter i with grave, */ + 
case 1349434568: + if (len == 6 /* && strncmp(ent, "igrave") == 0 */) { + return 236; + } + break; + /* iacute 237 latin small letter i with acute, */ + case 2726306241: + if (len == 6 /* && strncmp(ent, "iacute") == 0 */) { + return 237; + } + break; + /* icirc 238 latin small letter i with circumflex, */ + case 2684316313: + if (len == 5 /* && strncmp(ent, "icirc") == 0 */) { + return 238; + } + break; + /* iuml 239 latin small letter i with diaeresis, */ + case 931231619: + if (len == 4 /* && strncmp(ent, "iuml") == 0 */) { + return 239; + } + break; + /* eth 240 latin small letter eth, U+00F0 ISOlat1 */ + case 109822946: + if (len == 3 /* && strncmp(ent, "eth") == 0 */) { + return 240; + } + break; + /* ntilde 241 latin small letter n with tilde, */ + case 4106524690: + if (len == 6 /* && strncmp(ent, "ntilde") == 0 */) { + return 241; + } + break; + /* ograve 242 latin small letter o with grave, */ + case 1344270070: + if (len == 6 /* && strncmp(ent, "ograve") == 0 */) { + return 242; + } + break; + /* oacute 243 latin small letter o with acute, */ + case 2721141743: + if (len == 6 /* && strncmp(ent, "oacute") == 0 */) { + return 243; + } + break; + /* ocirc 244 latin small letter o with circumflex, */ + case 3638220799: + if (len == 5 /* && strncmp(ent, "ocirc") == 0 */) { + return 244; + } + break; + /* otilde 245 latin small letter o with tilde, */ + case 2674008175: + if (len == 6 /* && strncmp(ent, "otilde") == 0 */) { + return 245; + } + break; + /* ouml 246 latin small letter o with diaeresis, */ + case 1396160257: + if (len == 4 /* && strncmp(ent, "ouml") == 0 */) { + return 246; + } + break; + /* divide 247 division sign, U+00F7 ISOnum */ + case 2204943563: + if (len == 6 /* && strncmp(ent, "divide") == 0 */) { + return 247; + } + break; + /* oslash 248 latin small letter o with stroke, */ + case 2782156832: + if (len == 6 /* && strncmp(ent, "oslash") == 0 */) { + return 248; + } + break; + /* ugrave 249 latin small letter u with grave, */ + case 
1339105572: + if (len == 6 /* && strncmp(ent, "ugrave") == 0 */) { + return 249; + } + break; + /* uacute 250 latin small letter u with acute, */ + case 2715977245: + if (len == 6 /* && strncmp(ent, "uacute") == 0 */) { + return 250; + } + break; + /* ucirc 251 latin small letter u with circumflex, */ + case 297157989: + if (len == 5 /* && strncmp(ent, "ucirc") == 0 */) { + return 251; + } + break; + /* uuml 252 latin small letter u with diaeresis, */ + case 1861088895: + if (len == 4 /* && strncmp(ent, "uuml") == 0 */) { + return 252; + } + break; + /* yacute 253 latin small letter y with acute, */ + case 1280878481: + if (len == 6 /* && strncmp(ent, "yacute") == 0 */) { + return 253; + } + break; + /* thorn 254 latin small letter thorn with, */ + case 4294688446: + if (len == 5 /* && strncmp(ent, "thorn") == 0 */) { + return 254; + } + break; + /* yuml 255 latin small letter y with diaeresis, */ + case 739385555: + if (len == 4 /* && strncmp(ent, "yuml") == 0 */) { + return 255; + } + break; + /* fnof 402 latin small f with hook = function */ + case 2270075705: + if (len == 4 /* && strncmp(ent, "fnof") == 0 */) { + return 402; + } + break; + /* Alpha 913 greek capital letter alpha, U+0391 */ + case 4027656009: + if (len == 5 /* && strncmp(ent, "Alpha") == 0 */) { + return 913; + } + break; + /* Beta 914 greek capital letter beta, U+0392 */ + case 277666448: + if (len == 4 /* && strncmp(ent, "Beta") == 0 */) { + return 914; + } + break; + /* Gamma 915 greek capital letter gamma, */ + case 1537149070: + if (len == 5 /* && strncmp(ent, "Gamma") == 0 */) { + return 915; + } + break; + /* Delta 916 greek capital letter delta, */ + case 3855542753: + if (len == 5 /* && strncmp(ent, "Delta") == 0 */) { + return 916; + } + break; + /* Epsilon 917 greek capital letter epsilon, U+0395 */ + case 2449300823: + if (len == 7 /* && strncmp(ent, "Epsilon") == 0 */) { + return 917; + } + break; + /* Zeta 918 greek capital letter zeta, U+0396 */ + case 2137381000: + if (len == 4 
/* && strncmp(ent, "Zeta") == 0 */) { + return 918; + } + break; + /* Eta 919 greek capital letter eta, U+0397 */ + case 528494267: + if (len == 3 /* && strncmp(ent, "Eta") == 0 */) { + return 919; + } + break; + /* Theta 920 greek capital letter theta, */ + case 3904764433: + if (len == 5 /* && strncmp(ent, "Theta") == 0 */) { + return 920; + } + break; + /* Iota 921 greek capital letter iota, U+0399 */ + case 3284124477: + if (len == 4 /* && strncmp(ent, "Iota") == 0 */) { + return 921; + } + break; + /* Kappa 922 greek capital letter kappa, U+039A */ + case 3346788084: + if (len == 5 /* && strncmp(ent, "Kappa") == 0 */) { + return 922; + } + break; + /* Lambda 923 greek capital letter lambda, */ + case 1824315307: + if (len == 6 /* && strncmp(ent, "Lambda") == 0 */) { + return 923; + } + break; + /* Mu 924 greek capital letter mu, U+039C */ + case 1324604304: + if (len == 2 /* && strncmp(ent, "Mu") == 0 */) { + return 924; + } + break; + /* Nu 925 greek capital letter nu, U+039D */ + case 1326268829: + if (len == 2 /* && strncmp(ent, "Nu") == 0 */) { + return 925; + } + break; + /* Xi 926 greek capital letter xi, U+039E ISOgrk3 */ + case 1342914067: + if (len == 2 /* && strncmp(ent, "Xi") == 0 */) { + return 926; + } + break; + /* Omicron 927 greek capital letter omicron, U+039F */ + case 488730696: + if (len == 7 /* && strncmp(ent, "Omicron") == 0 */) { + return 927; + } + break; + /* Pi 928 greek capital letter pi, U+03A0 ISOgrk3 */ + case 1329597867: + if (len == 2 /* && strncmp(ent, "Pi") == 0 */) { + return 928; + } + break; + /* Rho 929 greek capital letter rho, U+03A1 */ + case 1277958850: + if (len == 3 /* && strncmp(ent, "Rho") == 0 */) { + return 929; + } + break; + /* Sigma 931 greek capital letter sigma, */ + case 3159100428: + if (len == 5 /* && strncmp(ent, "Sigma") == 0 */) { + return 931; + } + break; + /* Tau 932 greek capital letter tau, U+03A4 */ + case 2045446591: + if (len == 3 /* && strncmp(ent, "Tau") == 0 */) { + return 932; + } + break; 
+ /* Upsilon 933 greek capital letter upsilon, */ + case 2291992807: + if (len == 7 /* && strncmp(ent, "Upsilon") == 0 */) { + return 933; + } + break; + /* Phi 934 greek capital letter phi, */ + case 498819434: + if (len == 3 /* && strncmp(ent, "Phi") == 0 */) { + return 934; + } + break; + /* Chi 935 greek capital letter chi, U+03A7 */ + case 4024347861: + if (len == 3 /* && strncmp(ent, "Chi") == 0 */) { + return 935; + } + break; + /* Psi 936 greek capital letter psi, */ + case 517129209: + if (len == 3 /* && strncmp(ent, "Psi") == 0 */) { + return 936; + } + break; + /* Omega 937 greek capital letter omega, */ + case 612334204: + if (len == 5 /* && strncmp(ent, "Omega") == 0 */) { + return 937; + } + break; + /* alpha 945 greek small letter alpha, */ + case 525212009: + if (len == 5 /* && strncmp(ent, "alpha") == 0 */) { + return 945; + } + break; + /* beta 946 greek small letter beta, U+03B2 ISOgrk3 */ + case 4188941616: + if (len == 4 /* && strncmp(ent, "beta") == 0 */) { + return 946; + } + break; + /* gamma 947 greek small letter gamma, */ + case 2329672366: + if (len == 5 /* && strncmp(ent, "gamma") == 0 */) { + return 947; + } + break; + /* delta 948 greek small letter delta, */ + case 353098753: + if (len == 5 /* && strncmp(ent, "delta") == 0 */) { + return 948; + } + break; + /* epsilon 949 greek small letter epsilon, */ + case 2134684791: + if (len == 7 /* && strncmp(ent, "epsilon") == 0 */) { + return 949; + } + break; + /* zeta 950 greek small letter zeta, U+03B6 ISOgrk3 */ + case 1753688872: + if (len == 4 /* && strncmp(ent, "zeta") == 0 */) { + return 950; + } + break; + /* eta 951 greek small letter eta, U+03B7 ISOgrk3 */ + case 109822939: + if (len == 3 /* && strncmp(ent, "eta") == 0 */) { + return 951; + } + break; + /* theta 952 greek small letter theta, */ + case 402320433: + if (len == 5 /* && strncmp(ent, "theta") == 0 */) { + return 952; + } + break; + /* iota 953 greek small letter iota, U+03B9 ISOgrk3 */ + case 2900432349: + if (len == 4 
/* && strncmp(ent, "iota") == 0 */) { + return 953; + } + break; + /* kappa 954 greek small letter kappa, */ + case 4139311380: + if (len == 5 /* && strncmp(ent, "kappa") == 0 */) { + return 954; + } + break; + /* lambda 955 greek small letter lambda, */ + case 3228427083: + if (len == 6 /* && strncmp(ent, "lambda") == 0 */) { + return 955; + } + break; + /* mu 956 greek small letter mu, U+03BC ISOgrk3 */ + case 1377869104: + if (len == 2 /* && strncmp(ent, "mu") == 0 */) { + return 956; + } + break; + /* nu 957 greek small letter nu, U+03BD ISOgrk3 */ + case 1379533629: + if (len == 2 /* && strncmp(ent, "nu") == 0 */) { + return 957; + } + break; + /* xi 958 greek small letter xi, U+03BE ISOgrk3 */ + case 1396178867: + if (len == 2 /* && strncmp(ent, "xi") == 0 */) { + return 958; + } + break; + /* omicron 959 greek small letter omicron, U+03BF NEW */ + case 174114664: + if (len == 7 /* && strncmp(ent, "omicron") == 0 */) { + return 959; + } + break; + /* pi 960 greek small letter pi, U+03C0 ISOgrk3 */ + case 1382862667: + if (len == 2 /* && strncmp(ent, "pi") == 0 */) { + return 960; + } + break; + /* rho 961 greek small letter rho, U+03C1 ISOgrk3 */ + case 859287522: + if (len == 3 /* && strncmp(ent, "rho") == 0 */) { + return 961; + } + break; + /* sigmaf 962 greek small letter final sigma, */ + case 2582995969: + if (len == 6 /* && strncmp(ent, "sigmaf") == 0 */) { + return 962; + } + break; + /* sigma 963 greek small letter sigma, */ + case 3951623724: + if (len == 5 /* && strncmp(ent, "sigma") == 0 */) { + return 963; + } + break; + /* tau 964 greek small letter tau, U+03C4 ISOgrk3 */ + case 1626775263: + if (len == 3 /* && strncmp(ent, "tau") == 0 */) { + return 964; + } + break; + /* upsilon 965 greek small letter upsilon, */ + case 1977376775: + if (len == 7 /* && strncmp(ent, "upsilon") == 0 */) { + return 965; + } + break; + /* phi 966 greek small letter phi, U+03C6 ISOgrk3 */ + case 80148106: + if (len == 3 /* && strncmp(ent, "phi") == 0 */) { + return 
966; + } + break; + /* chi 967 greek small letter chi, U+03C7 ISOgrk3 */ + case 3605676533: + if (len == 3 /* && strncmp(ent, "chi") == 0 */) { + return 967; + } + break; + /* psi 968 greek small letter psi, U+03C8 ISOgrk3 */ + case 98457881: + if (len == 3 /* && strncmp(ent, "psi") == 0 */) { + return 968; + } + break; + /* omega 969 greek small letter omega, */ + case 1404857500: + if (len == 5 /* && strncmp(ent, "omega") == 0 */) { + return 969; + } + break; + /* thetasym 977 greek small letter theta symbol, */ + case 3881711083: + if (len == 8 /* && strncmp(ent, "thetasym") == 0 */) { + return 977; + } + break; + /* upsih 978 greek upsilon with hook symbol, */ + case 3753563936: + if (len == 5 /* && strncmp(ent, "upsih") == 0 */) { + return 978; + } + break; + /* piv 982 greek pi symbol, U+03D6 ISOgrk3 */ + case 81812644: + if (len == 3 /* && strncmp(ent, "piv") == 0 */) { + return 982; + } + break; + /* bull 8226 bullet = black small circle, */ + case 1818806115: + if (len == 4 /* && strncmp(ent, "bull") == 0 */) { + return 8226; + } + break; + /* hellip 8230 horizontal ellipsis = three dot leader, */ + case 1967714928: + if (len == 6 /* && strncmp(ent, "hellip") == 0 */) { + return 8230; + } + break; + /* prime 8242 prime = minutes = feet, U+2032 ISOtech */ + case 656236556: + if (len == 5 /* && strncmp(ent, "prime") == 0 */) { + return 8242; + } + break; + /* Prime 8243 double prime = seconds = inches, */ + case 4158680556: + if (len == 5 /* && strncmp(ent, "Prime") == 0 */) { + return 8243; + } + break; + /* oline 8254 overline = spacing overscore, */ + case 33988362: + if (len == 5 /* && strncmp(ent, "oline") == 0 */) { + return 8254; + } + break; + /* frasl 8260 fraction slash, U+2044 NEW */ + case 254792559: + if (len == 5 /* && strncmp(ent, "frasl") == 0 */) { + return 8260; + } + break; + /* weierp 8472 script capital P = power set */ + case 3305299450: + if (len == 6 /* && strncmp(ent, "weierp") == 0 */) { + return 8472; + } + break; + /* image 8465 
blackletter capital I = imaginary part, */ + case 3187641494: + if (len == 5 /* && strncmp(ent, "image") == 0 */) { + return 8465; + } + break; + /* real 8476 blackletter capital R = real part symbol, */ + case 3965469588: + if (len == 4 /* && strncmp(ent, "real") == 0 */) { + return 8476; + } + break; + /* trade 8482 trade mark sign, U+2122 ISOnum */ + case 2455601811: + if (len == 5 /* && strncmp(ent, "trade") == 0 */) { + return 8482; + } + break; + /* alefsym 8501 alef symbol = first transfinite cardinal, */ + case 3894502290: + if (len == 7 /* && strncmp(ent, "alefsym") == 0 */) { + return 8501; + } + break; + /* larr 8592 leftwards arrow, U+2190 ISOnum */ + case 1970559061: + if (len == 4 /* && strncmp(ent, "larr") == 0 */) { + return 8592; + } + break; + /* uarr 8593 upwards arrow, U+2191 ISOnum */ + case 2667952018: + if (len == 4 /* && strncmp(ent, "uarr") == 0 */) { + return 8593; + } + break; + /* rarr 8594 rightwards arrow, U+2192 ISOnum */ + case 2435487699: + if (len == 4 /* && strncmp(ent, "rarr") == 0 */) { + return 8594; + } + break; + /* darr 8595 downwards arrow, U+2193 ISOnum */ + case 4213965741: + if (len == 4 /* && strncmp(ent, "darr") == 0 */) { + return 8595; + } + break; + /* harr 8596 left right arrow, U+2194 ISOamsa */ + case 3092262401: + if (len == 4 /* && strncmp(ent, "harr") == 0 */) { + return 8596; + } + break; + /* crarr 8629 downwards arrow with corner leftwards */ + case 4071143093: + if (len == 5 /* && strncmp(ent, "crarr") == 0 */) { + return 8629; + } + break; + /* lArr 8656 leftwards double arrow, U+21D0 ISOtech */ + case 2389230389: + if (len == 4 /* && strncmp(ent, "lArr") == 0 */) { + return 8656; + } + break; + /* uArr 8657 upwards double arrow, U+21D1 ISOamsa */ + case 3086623346: + if (len == 4 /* && strncmp(ent, "uArr") == 0 */) { + return 8657; + } + break; + /* rArr 8658 rightwards double arrow, */ + case 2854159027: + if (len == 4 /* && strncmp(ent, "rArr") == 0 */) { + return 8658; + } + break; + /* dArr 8659 
downwards double arrow, U+21D3 ISOamsa */ + case 337669773: + if (len == 4 /* && strncmp(ent, "dArr") == 0 */) { + return 8659; + } + break; + /* hArr 8660 left right double arrow, */ + case 3510933729: + if (len == 4 /* && strncmp(ent, "hArr") == 0 */) { + return 8660; + } + break; + /* forall 8704 for all, U+2200 ISOtech */ + case 2607244222: + if (len == 6 /* && strncmp(ent, "forall") == 0 */) { + return 8704; + } + break; + /* part 8706 partial differential, U+2202 ISOtech */ + case 848855723: + if (len == 4 /* && strncmp(ent, "part") == 0 */) { + return 8706; + } + break; + /* exist 8707 there exists, U+2203 ISOtech */ + case 3677294764: + if (len == 5 /* && strncmp(ent, "exist") == 0 */) { + return 8707; + } + break; + /* empty 8709 empty set = null set = diameter, */ + case 4121922294: + if (len == 5 /* && strncmp(ent, "empty") == 0 */) { + return 8709; + } + break; + /* nabla 8711 nabla = backward difference, */ + case 3450596949: + if (len == 5 /* && strncmp(ent, "nabla") == 0 */) { + return 8711; + } + break; + /* isin 8712 element of, U+2208 ISOtech */ + case 145434111: + if (len == 4 /* && strncmp(ent, "isin") == 0 */) { + return 8712; + } + break; + /* notin 8713 not an element of, U+2209 ISOtech */ + case 89445443: + if (len == 5 /* && strncmp(ent, "notin") == 0 */) { + return 8713; + } + break; + /* ni 8715 contains as member, U+220B ISOtech */ + case 1379533617: + if (len == 2 /* && strncmp(ent, "ni") == 0 */) { + return 8715; + } + break; + /* prod 8719 n-ary product = product sign, */ + case 3171579821: + if (len == 4 /* && strncmp(ent, "prod") == 0 */) { + return 8719; + } + break; + /* sum 8721 n-ary sumation, U+2211 ISOamsb */ + case 1270496050: + if (len == 3 /* && strncmp(ent, "sum") == 0 */) { + return 8721; + } + break; + /* minus 8722 minus sign, U+2212 ISOtech */ + case 1443056095: + if (len == 5 /* && strncmp(ent, "minus") == 0 */) { + return 8722; + } + break; + /* lowast 8727 asterisk operator, U+2217 ISOtech */ + case 137860408: + if 
(len == 6 /* && strncmp(ent, "lowast") == 0 */) { + return 8727; + } + break; + /* radic 8730 square root = radical sign, */ + case 565711814: + if (len == 5 /* && strncmp(ent, "radic") == 0 */) { + return 8730; + } + break; + /* prop 8733 proportional to, U+221D ISOtech */ + case 3171579833: + if (len == 4 /* && strncmp(ent, "prop") == 0 */) { + return 8733; + } + break; + /* infin 8734 infinity, U+221E ISOtech */ + case 3784651419: + if (len == 5 /* && strncmp(ent, "infin") == 0 */) { + return 8734; + } + break; + /* ang 8736 angle, U+2220 ISOamso */ + case 2836524271: + if (len == 3 /* && strncmp(ent, "ang") == 0 */) { + return 8736; + } + break; + /* and 8743 logical and = wedge, U+2227 ISOtech */ + case 2836524268: + if (len == 3 /* && strncmp(ent, "and") == 0 */) { + return 8743; + } + break; + /* or 8744 logical or = vee, U+2228 ISOtech */ + case 1381198151: + if (len == 2 /* && strncmp(ent, "or") == 0 */) { + return 8744; + } + break; + /* cap 8745 intersection = cap, U+2229 ISOtech */ + case 3594024865: + if (len == 3 /* && strncmp(ent, "cap") == 0 */) { + return 8745; + } + break; + /* cup 8746 union = cup, U+222A ISOtech */ + case 3627315365: + if (len == 3 /* && strncmp(ent, "cup") == 0 */) { + return 8746; + } + break; + /* int 8747 integral, U+222B ISOtech */ + case 1658114628: + if (len == 3 /* && strncmp(ent, "int") == 0 */) { + return 8747; + } + break; + /* there4 8756 therefore, U+2234 ISOtech */ + case 1359369970: + if (len == 6 /* && strncmp(ent, "there4") == 0 */) { + return 8756; + } + break; + /* sim 8764 tilde operator = varies with = similar to, */ + case 1250521750: + if (len == 3 /* && strncmp(ent, "sim") == 0 */) { + return 8764; + } + break; + /* cong 8773 approximately equal to, U+2245 ISOtech */ + case 2425516567: + if (len == 4 /* && strncmp(ent, "cong") == 0 */) { + return 8773; + } + break; + /* asymp 8776 almost equal to = asymptotic to, */ + case 3150422973: + if (len == 5 /* && strncmp(ent, "asymp") == 0 */) { + return 8776; + 
} + break; + /* ne 8800 not equal to, U+2260 ISOtech */ + case 1379533613: + if (len == 2 /* && strncmp(ent, "ne") == 0 */) { + return 8800; + } + break; + /* equiv 8801 identical to, U+2261 ISOtech */ + case 634790405: + if (len == 5 /* && strncmp(ent, "equiv") == 0 */) { + return 8801; + } + break; + /* le 8804 less-than or equal to, U+2264 ISOtech */ + case 1376204563: + if (len == 2 /* && strncmp(ent, "le") == 0 */) { + return 8804; + } + break; + /* ge 8805 greater-than or equal to, */ + case 1367881938: + if (len == 2 /* && strncmp(ent, "ge") == 0 */) { + return 8805; + } + break; + /* sub 8834 subset of, U+2282 ISOtech */ + case 1270496039: + if (len == 3 /* && strncmp(ent, "sub") == 0 */) { + return 8834; + } + break; + /* sup 8835 superset of, U+2283 ISOtech */ + case 1270496053: + if (len == 3 /* && strncmp(ent, "sup") == 0 */) { + return 8835; + } + break; + /* nsub 8836 not a subset of, U+2284 ISOamsn */ + case 1984504696: + if (len == 4 /* && strncmp(ent, "nsub") == 0 */) { + return 8836; + } + break; + /* sube 8838 subset of or equal to, U+2286 ISOtech */ + case 256147135: + if (len == 4 /* && strncmp(ent, "sube") == 0 */) { + return 8838; + } + break; + /* supe 8839 superset of or equal to, */ + case 279450485: + if (len == 4 /* && strncmp(ent, "supe") == 0 */) { + return 8839; + } + break; + /* oplus 8853 circled plus = direct sum, */ + case 92645826: + if (len == 5 /* && strncmp(ent, "oplus") == 0 */) { + return 8853; + } + break; + /* otimes 8855 circled times = vector product, */ + case 3065242419: + if (len == 6 /* && strncmp(ent, "otimes") == 0 */) { + return 8855; + } + break; + /* perp 8869 up tack = orthogonal to = perpendicular, */ + case 2407134539: + if (len == 4 /* && strncmp(ent, "perp") == 0 */) { + return 8869; + } + break; + /* sdot 8901 dot operator, U+22C5 ISOamsb */ + case 2245035582: + if (len == 4 /* && strncmp(ent, "sdot") == 0 */) { + return 8901; + } + break; + /* lceil 8968 left ceiling = apl upstile, */ + case 1588009020: + 
if (len == 5 /* && strncmp(ent, "lceil") == 0 */) { + return 8968; + } + break; + /* rceil 8969 right ceiling, U+2309 ISOamsc */ + case 2541913506: + if (len == 5 /* && strncmp(ent, "rceil") == 0 */) { + return 8969; + } + break; + /* lfloor 8970 left floor = apl downstile, */ + case 1870296512: + if (len == 6 /* && strncmp(ent, "lfloor") == 0 */) { + return 8970; + } + break; + /* rfloor 8971 right floor, U+230B ISOamsc */ + case 1865132014: + if (len == 6 /* && strncmp(ent, "rfloor") == 0 */) { + return 8971; + } + break; + /* lang 9001 left-pointing angle bracket = bra, */ + case 1963900950: + if (len == 4 /* && strncmp(ent, "lang") == 0 */) { + return 9001; + } + break; + /* rang 9002 right-pointing angle bracket = ket, */ + case 2428829588: + if (len == 4 /* && strncmp(ent, "rang") == 0 */) { + return 9002; + } + break; + /* loz 9674 lozenge, U+25CA ISOpub */ + case 2828488274: + if (len == 3 /* && strncmp(ent, "loz") == 0 */) { + return 9674; + } + break; + /* spades 9824 black spade suit, U+2660 ISOpub */ + case 4026453962: + if (len == 6 /* && strncmp(ent, "spades") == 0 */) { + return 9824; + } + break; + /* clubs 9827 black club suit = shamrock, */ + case 2781041564: + if (len == 5 /* && strncmp(ent, "clubs") == 0 */) { + return 9827; + } + break; + /* hearts 9829 black heart suit = valentine, */ + case 2039418001: + if (len == 6 /* && strncmp(ent, "hearts") == 0 */) { + return 9829; + } + break; + /* diams 9830 black diamond suit, U+2666 ISOpub */ + case 3524411593: + if (len == 5 /* && strncmp(ent, "diams") == 0 */) { + return 9830; + } + break; + /* quot 34 quotation mark = APL quote, */ + case 2986121293: + if (len == 4 /* && strncmp(ent, "quot") == 0 */) { + return 34; + } + break; + /* amp 38 ampersand, U+0026 ISOnum */ + case 2834859755: + if (len == 3 /* && strncmp(ent, "amp") == 0 */) { + return 38; + } + break; + /* lt 60 less-than sign, U+003C ISOnum */ + case 1376204578: + if (len == 2 /* && strncmp(ent, "lt") == 0 */) { + return 60; + } + 
break; + /* gt 62 greater-than sign, U+003E ISOnum */ + case 1367881953: + if (len == 2 /* && strncmp(ent, "gt") == 0 */) { + return 62; + } + break; + /* OElig 338 latin capital ligature OE, */ + case 1674782707: + if (len == 5 /* && strncmp(ent, "OElig") == 0 */) { + return 338; + } + break; + /* oelig 339 latin small ligature oe, U+0153 ISOlat2 */ + case 2083613875: + if (len == 5 /* && strncmp(ent, "oelig") == 0 */) { + return 339; + } + break; + /* Scaron 352 latin capital letter S with caron, */ + case 1731202952: + if (len == 6 /* && strncmp(ent, "Scaron") == 0 */) { + return 352; + } + break; + /* scaron 353 latin small letter s with caron, */ + case 3135314728: + if (len == 6 /* && strncmp(ent, "scaron") == 0 */) { + return 353; + } + break; + /* Yuml 376 latin capital letter Y with diaeresis, */ + case 1123077683: + if (len == 4 /* && strncmp(ent, "Yuml") == 0 */) { + return 376; + } + break; + /* circ 710 modifier letter circumflex accent, */ + case 94756433: + if (len == 4 /* && strncmp(ent, "circ") == 0 */) { + return 710; + } + break; + /* tilde 732 small tilde, U+02DC ISOdia */ + case 1748508313: + if (len == 5 /* && strncmp(ent, "tilde") == 0 */) { + return 732; + } + break; + /* ensp 8194 en space, U+2002 ISOpub */ + case 3630901474: + if (len == 4 /* && strncmp(ent, "ensp") == 0 */) { + return 8194; + } + break; + /* emsp 8195 em space, U+2003 ISOpub */ + case 3241331769: + if (len == 4 /* && strncmp(ent, "emsp") == 0 */) { + return 8195; + } + break; + /* thinsp 8201 thin space, U+2009 ISOpub */ + case 2997658516: + if (len == 6 /* && strncmp(ent, "thinsp") == 0 */) { + return 8201; + } + break; + /* zwnj 8204 zero width non-joiner, */ + case 166021829: + if (len == 4 /* && strncmp(ent, "zwnj") == 0 */) { + return 8204; + } + break; + /* zwj 8205 zero width joiner, U+200D NEW RFC 2070 */ + case 4000813032: + if (len == 3 /* && strncmp(ent, "zwj") == 0 */) { + return 8205; + } + break; + /* lrm 8206 left-to-right mark, U+200E NEW RFC 2070 */ + 
case 2833481836: + if (len == 3 /* && strncmp(ent, "lrm") == 0 */) { + return 8206; + } + break; + /* rlm 8207 right-to-left mark, U+200F NEW RFC 2070 */ + case 865945620: + if (len == 3 /* && strncmp(ent, "rlm") == 0 */) { + return 8207; + } + break; + /* ndash 8211 en dash, U+2013 ISOpub */ + case 3305143245: + if (len == 5 /* && strncmp(ent, "ndash") == 0 */) { + return 8211; + } + break; + /* mdash 8212 em dash, U+2014 ISOpub */ + case 3146159164: + if (len == 5 /* && strncmp(ent, "mdash") == 0 */) { + return 8212; + } + break; + /* lsquo 8216 left single quotation mark, */ + case 1796006423: + if (len == 5 /* && strncmp(ent, "lsquo") == 0 */) { + return 8216; + } + break; + /* rsquo 8217 right single quotation mark, */ + case 2749910909: + if (len == 5 /* && strncmp(ent, "rsquo") == 0 */) { + return 8217; + } + break; + /* sbquo 8218 single low-9 quotation mark, U+201A NEW */ + case 159941417: + if (len == 5 /* && strncmp(ent, "sbquo") == 0 */) { + return 8218; + } + break; + /* ldquo 8220 left double quotation mark, */ + case 633684828: + if (len == 5 /* && strncmp(ent, "ldquo") == 0 */) { + return 8220; + } + break; + /* rdquo 8221 right double quotation mark, */ + case 1587589314: + if (len == 5 /* && strncmp(ent, "rdquo") == 0 */) { + return 8221; + } + break; + /* bdquo 8222 double low-9 quotation mark, U+201E NEW */ + case 3338811314: + if (len == 5 /* && strncmp(ent, "bdquo") == 0 */) { + return 8222; + } + break; + /* dagger 8224 dagger, U+2020 ISOpub */ + case 3288241744: + if (len == 6 /* && strncmp(ent, "dagger") == 0 */) { + return 8224; + } + break; + /* Dagger 8225 double dagger, U+2021 ISOpub */ + case 1884129968: + if (len == 6 /* && strncmp(ent, "Dagger") == 0 */) { + return 8225; + } + break; + /* permil 8240 per mille sign, U+2030 ISOtech */ + case 4246983035: + if (len == 6 /* && strncmp(ent, "permil") == 0 */) { + return 8240; + } + break; + /* lsaquo 8249 single left-pointing angle quotation mark, */ + case 2442191187: + if (len == 6 /* 
&& strncmp(ent, "lsaquo") == 0 */) { + return 8249; + } + break; + /* rsaquo 8250 single right-pointing angle quotation mark, */ + case 2437026689: + if (len == 6 /* && strncmp(ent, "rsaquo") == 0 */) { + return 8250; + } + break; + /* euro 8364 euro sign, U+20AC NEW */ + case 2061257587: + if (len == 4 /* && strncmp(ent, "euro") == 0 */) { + return 8364; + } + break; + } + /* unknown */ + return -1; +} diff --git a/src/htsentities.sh b/src/htsentities.sh new file mode 100755 index 0000000..5039aee --- /dev/null +++ b/src/htsentities.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# + +src=html40.txt +url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt +dest=htsentities.h + +( + cat <<EOF +/* + -- ${dest} -- + FILE GENERATED BY $0, DO NOT MODIFY + + We compute the LCG hash + (see <http://en.wikipedia.org/wiki/Linear_congruential_generator>) + for each entity. We should in theory check using strncmp() that we + actually have the correct entity, but this is actually statistically + not needed. + + We may want to do better, but we expect the hash function to be uniform, and + let the compiler be smart enough to optimize the switch (for example by + checking in log2() intervals) + + This code has been generated using the evil $0 script. 
+*/ + +static int decode_entity(const unsigned int hash, const size_t len) { + switch(hash) { +EOF + ( + if test -f ${src}; then + cat ${src} + else + GET "${url}" + fi + ) \ + | grep -E '^<!ENTITY [a-zA-Z0-9_]' \ + | sed \ + -e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \ + -e 's/-->$//' \ + -e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/'\ +| ( \ + read A + while test -n "$A"; do + ent="${A%% *}" + code=$(echo "$A"|cut -f2 -d' ') + # compute hash + hash=0 + i=0 + a=1664525 + c=1013904223 + m="$[1 << 32]" + while test "$i" -lt ${#ent}; do + d="$(echo -n "${ent:${i}:1}"|hexdump -v -e '/1 "%d"')" + hash="$[((${hash}*${a})%(${m})+${d}+${c})%(${m})]" + i=$[${i}+1] + done + echo -e " /* $A */" + echo -e " case ${hash}:" + echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {" + echo -e " return ${code};" + echo -e " }" + echo -e " break;" + + # next + read A + done + ) + cat <<EOF + } + /* unknown */ + return -1; +} +EOF +) > ${dest} diff --git a/src/htslib.c b/src/htslib.c index fe9f240..8b53b88 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -54,6 +54,7 @@ Please visit our Website: http://www.httrack.com #include "htsmd5.h" #include "htsmodules.h" #include "htscharset.h" +#include "htsencoding.h" #ifdef _WIN32 #ifndef _WIN32_WCE @@ -3737,255 +3738,6 @@ void code64(unsigned char *a, int size_a, unsigned char *b, int crlf) { *b++ = '\0'; } -// remplacer " par " etc.. 
-// buffer MAX 1Ko -#define strcmpbeg(a, b) strncmp(a, b, strlen(b)) -HTSEXT_API void unescape_amp(char *s) { - while(*s) { - if (*s == '&') { - char *end = strchr(s, ';'); - - if (end && (((int) (end - s)) <= 8)) { - unsigned char c = 0; - - // http://www.w3.org/TR/xhtml-modularization/dtd_module_defs.html - if (strcmpbeg(s, "&#") == 0) { - int num = 0; - - if ((s[2] == 'x') || (s[2] == 'X')) { - if (sscanf(s + 3, "%x", &num) == 1 && num <= 0xff) { - c = (unsigned char) num; - } - } else { - if (sscanf(s + 2, "%d", &num) == 1 && num <= 0xff) { - c = (unsigned char) num; - } - } - } else if (strcmpbeg(s, " ") == 0) - c = 32; // hack - c=160; - else if (strcmpbeg(s, "¡") == 0) - c = 161; - else if (strcmpbeg(s, "¢") == 0) - c = 162; - else if (strcmpbeg(s, "£") == 0) - c = 163; - else if (strcmpbeg(s, "¤") == 0) - c = 164; - else if (strcmpbeg(s, "¥") == 0) - c = 165; - else if (strcmpbeg(s, "¦") == 0) - c = 166; - else if (strcmpbeg(s, "§") == 0) - c = 167; - else if (strcmpbeg(s, "¨") == 0) - c = 168; - else if (strcmpbeg(s, "©") == 0) - c = 169; - else if (strcmpbeg(s, "ª") == 0) - c = 170; - //else if (strcmpbeg(s, "«")==0) - // c=171; - else if (strcmpbeg(s, "¬") == 0) - c = 172; - //else if (strcmpbeg(s, "­")==0) - // c=173; - else if (strcmpbeg(s, "®") == 0) - c = 174; - else if (strcmpbeg(s, "¯") == 0) - c = 175; - else if (strcmpbeg(s, "°") == 0) - c = 176; - else if (strcmpbeg(s, "±") == 0) - c = 177; - else if (strcmpbeg(s, "²") == 0) - c = 178; - else if (strcmpbeg(s, "³") == 0) - c = 179; - else if (strcmpbeg(s, "´") == 0) - c = 180; - else if (strcmpbeg(s, "µ") == 0) - c = 181; - else if (strcmpbeg(s, "¶") == 0) - c = 182; - else if (strcmpbeg(s, "·") == 0) - c = 183; - else if (strcmpbeg(s, "¸") == 0) - c = 184; - else if (strcmpbeg(s, "¹") == 0) - c = 185; - else if (strcmpbeg(s, "º") == 0) - c = 186; - //else if (strcmpbeg(s, "»")==0) - // c=187; - else if (strcmpbeg(s, "¼") == 0) - c = 188; - else if (strcmpbeg(s, "½") == 0) - c = 189; - else if 
(strcmpbeg(s, "¾") == 0) - c = 190; - else if (strcmpbeg(s, "¿") == 0) - c = 191; - else if (strcmpbeg(s, "À") == 0) - c = 192; - else if (strcmpbeg(s, "Á") == 0) - c = 193; - else if (strcmpbeg(s, "Â") == 0) - c = 194; - else if (strcmpbeg(s, "Ã") == 0) - c = 195; - else if (strcmpbeg(s, "Ä") == 0) - c = 196; - else if (strcmpbeg(s, "Å") == 0) - c = 197; - else if (strcmpbeg(s, "Æ") == 0) - c = 198; - else if (strcmpbeg(s, "Ç") == 0) - c = 199; - else if (strcmpbeg(s, "È") == 0) - c = 200; - else if (strcmpbeg(s, "É") == 0) - c = 201; - else if (strcmpbeg(s, "Ê") == 0) - c = 202; - else if (strcmpbeg(s, "Ë") == 0) - c = 203; - else if (strcmpbeg(s, "Ì") == 0) - c = 204; - else if (strcmpbeg(s, "Í") == 0) - c = 205; - else if (strcmpbeg(s, "Î") == 0) - c = 206; - else if (strcmpbeg(s, "Ï") == 0) - c = 207; - else if (strcmpbeg(s, "Ð") == 0) - c = 208; - else if (strcmpbeg(s, "Ñ") == 0) - c = 209; - else if (strcmpbeg(s, "Ò") == 0) - c = 210; - else if (strcmpbeg(s, "Ó") == 0) - c = 211; - else if (strcmpbeg(s, "Ô") == 0) - c = 212; - else if (strcmpbeg(s, "Õ") == 0) - c = 213; - else if (strcmpbeg(s, "Ö") == 0) - c = 214; - else if (strcmpbeg(s, "×") == 0) - c = 215; - else if (strcmpbeg(s, "Ø") == 0) - c = 216; - else if (strcmpbeg(s, "Ù") == 0) - c = 217; - else if (strcmpbeg(s, "Ú") == 0) - c = 218; - else if (strcmpbeg(s, "Û") == 0) - c = 219; - else if (strcmpbeg(s, "Ü") == 0) - c = 220; - else if (strcmpbeg(s, "Ý") == 0) - c = 221; - else if (strcmpbeg(s, "Þ") == 0) - c = 222; - else if (strcmpbeg(s, "ß") == 0) - c = 223; - else if (strcmpbeg(s, "à") == 0) - c = 224; - else if (strcmpbeg(s, "á") == 0) - c = 225; - else if (strcmpbeg(s, "â") == 0) - c = 226; - else if (strcmpbeg(s, "ã") == 0) - c = 227; - else if (strcmpbeg(s, "ä") == 0) - c = 228; - else if (strcmpbeg(s, "å") == 0) - c = 229; - else if (strcmpbeg(s, "æ") == 0) - c = 230; - else if (strcmpbeg(s, "ç") == 0) - c = 231; - else if (strcmpbeg(s, "è") == 0) - c = 232; - else if (strcmpbeg(s, "é") == 
0) - c = 233; - else if (strcmpbeg(s, "ê") == 0) - c = 234; - else if (strcmpbeg(s, "ë") == 0) - c = 235; - else if (strcmpbeg(s, "ì") == 0) - c = 236; - else if (strcmpbeg(s, "í") == 0) - c = 237; - else if (strcmpbeg(s, "î") == 0) - c = 238; - else if (strcmpbeg(s, "ï") == 0) - c = 239; - else if (strcmpbeg(s, "ð") == 0) - c = 240; - else if (strcmpbeg(s, "ñ") == 0) - c = 241; - else if (strcmpbeg(s, "ò") == 0) - c = 242; - else if (strcmpbeg(s, "ó") == 0) - c = 243; - else if (strcmpbeg(s, "ô") == 0) - c = 244; - else if (strcmpbeg(s, "õ") == 0) - c = 245; - else if (strcmpbeg(s, "ö") == 0) - c = 246; - else if (strcmpbeg(s, "÷") == 0) - c = 247; - else if (strcmpbeg(s, "ø") == 0) - c = 248; - else if (strcmpbeg(s, "ù") == 0) - c = 249; - else if (strcmpbeg(s, "ú") == 0) - c = 250; - else if (strcmpbeg(s, "û") == 0) - c = 251; - else if (strcmpbeg(s, "ü") == 0) - c = 252; - else if (strcmpbeg(s, "ý") == 0) - c = 253; - else if (strcmpbeg(s, "þ") == 0) - c = 254; - else if (strcmpbeg(s, "ÿ") == 0) - c = 255; - // - else if (strcmpbeg(s, "&") == 0) - c = '&'; - else if (strcmpbeg(s, ">") == 0) - c = '>'; - else if (strcmpbeg(s, "«") == 0) - c = '\"'; - else if (strcmpbeg(s, "<") == 0) - c = '<'; - else if (strcmpbeg(s, " ") == 0) - c = ' '; - else if (strcmpbeg(s, """) == 0) - c = '\"'; - else if (strcmpbeg(s, "»") == 0) - c = '\"'; - else if (strcmpbeg(s, "­") == 0) - c = '-'; - else if (strcmpbeg(s, "˜") == 0) - c = '~'; - // remplacer? - if (c) { - char BIGSTK buff[HTS_URLMAXSIZE * 2]; - - buff[0] = (char) c; - strcpybuff(buff + 1, end + 1); - strcpybuff(s, buff); - } - } - } - s++; - } -} - static int ehexh(char c) { if ((c >= '0') && (c <= '9')) return c - '0'; @@ -4000,6 +3752,12 @@ static int ehex(const char *s) { return 16 * ehexh(*s) + ehexh(*(s + 1)); } +void unescape_amp(char *s) { + if (hts_unescape_entities(s, s, strlen(s) + 1) != 0) { + assertf(! "error escaping html entities"); + } +} + // remplacer %20 par ' ', | par : etc.. 
// buffer MAX 1Ko HTSEXT_API char *unescape_http(char *catbuff, const char *s) { diff --git a/src/htsparse.c b/src/htsparse.c index 419d882..52445b3 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -2092,11 +2092,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { *a = '\0'; } else query[0] = '\0'; - // conversion &amp; -> & et autres joyeusetés - unescape_amp(lien); - unescape_amp(query); // décoder l'inutile (%2E par exemple) et coder espaces - // Unescape high-chars foir UTF-8 conversion + // Unescape high-chars for UTF-8 conversion strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset)); /* note: '%' is still escaped */ escape_remove_control(lien); // ???? No! escape_spc_url(lien); @@ -2115,6 +2112,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { free(s); } } + // conversion entities + unescape_amp(lien); + unescape_amp(query); } // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance! |