summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorXavier Roche <xroche@users.noreply.github.com>2013-05-30 19:04:51 +0000
committerXavier Roche <xroche@users.noreply.github.com>2013-05-30 19:04:51 +0000
commit850f165f4ac90a6e6687c392ddfdd0c6a05b3fe5 (patch)
tree7cac2f01468639c4ab63fe523c17d7638e8cd2ac /src
parent01af2a5e73f53ebf8a092e4bda77cd1326c1da11 (diff)
Added hts_unescape_entities(), a rewrite of the HTML entities decoder.
Fixed HTML entities decoding which was done before charset decoding.
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.am10
-rw-r--r--src/Makefile.in22
-rw-r--r--src/htscoremain.c17
-rw-r--r--src/htsencoding.c176
-rw-r--r--src/htsencoding.h55
-rw-r--r--src/htsentities.h1535
-rwxr-xr-xsrc/htsentities.sh75
-rw-r--r--src/htslib.c256
-rw-r--r--src/htsparse.c8
9 files changed, 1885 insertions, 269 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 0901f6d..59a6778 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -25,8 +25,8 @@ INCLUDES = \
bin_PROGRAMS = proxytrack httrack htsserver
-httrack_LDADD = $(THREADS_LIBS) $(OPENSSL_LIBS) libhttrack.la
-htsserver_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) $(OPENSSL_LIBS) libhttrack.la
+httrack_LDADD = $(THREADS_LIBS) libhttrack.la
+htsserver_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) libhttrack.la
proxytrack_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS)
proxytrack_CFLAGS = $(AM_CFLAGS) -DNO_MALLOCT
@@ -47,7 +47,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
htsname.c htsrobots.c htstools.c htswizard.c \
htsalias.c htsthread.c htsindex.c htsbauth.c \
htsmd5.c htszlib.c htswrap.c \
- htsmodules.c htscharset.c punycode.c \
+ htsmodules.c htscharset.c punycode.c htsencoding.c \
md5.c \
htsmms.c \
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
@@ -60,7 +60,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
htsmodules.h htsname.h htsnet.h \
htsopt.h htsrobots.h htsthread.h \
htstools.h htswizard.h htswrap.h htszlib.h \
- htsstrings.h httrack-library.h htscharset.h punycode.h \
+ htsstrings.h httrack-library.h htscharset.h punycode.h htsencoding.h \
md5.h \
htsmms.h \
minizip/crypt.h minizip/ioapi.h minizip/mztools.h minizip/unzip.h minizip/zip.h \
@@ -69,7 +69,7 @@ libhttrack_la_LIBADD = $(THREADS_LIBS) $(ZLIB_LIBS) $(OPENSSL_LIBS) $(DL_LIBS) $
libhttrack_la_LDFLAGS = -version-info $(VERSION_INFO)
libhtsjava_la_SOURCES = htsjava.c htsjava.h
-libhtsjava_la_LIBADD = $(THREADS_LIBS) $(OPENSSL_LIBS) $(DL_LIBS) libhttrack.la
+libhtsjava_la_LIBADD = $(THREADS_LIBS) $(DL_LIBS) libhttrack.la
libhtsjava_la_LDFLAGS = -version-info $(VERSION_INFO)
EXTRA_DIST = httrack.h webhttrack \
diff --git a/src/Makefile.in b/src/Makefile.in
index 30a1e93..9a943c1 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -67,7 +67,7 @@ libLTLIBRARIES_INSTALL = $(INSTALL)
LTLIBRARIES = $(lib_LTLIBRARIES)
am__DEPENDENCIES_1 =
libhtsjava_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \
- $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) libhttrack.la
+ $(am__DEPENDENCIES_1) libhttrack.la
am_libhtsjava_la_OBJECTS = htsjava.lo
libhtsjava_la_OBJECTS = $(am_libhtsjava_la_OBJECTS)
libhttrack_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \
@@ -78,19 +78,18 @@ am_libhttrack_la_OBJECTS = htscore.lo htsparse.lo htsback.lo \
htsinthash.lo htshelp.lo htslib.lo htscoremain.lo htsname.lo \
htsrobots.lo htstools.lo htswizard.lo htsalias.lo htsthread.lo \
htsindex.lo htsbauth.lo htsmd5.lo htszlib.lo htswrap.lo \
- htsmodules.lo htscharset.lo punycode.lo md5.lo htsmms.lo \
- ioapi.lo mztools.lo unzip.lo zip.lo error.lo mms.lo
+ htsmodules.lo htscharset.lo punycode.lo htsencoding.lo md5.lo \
+ htsmms.lo ioapi.lo mztools.lo unzip.lo zip.lo error.lo mms.lo
libhttrack_la_OBJECTS = $(am_libhttrack_la_OBJECTS)
binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
PROGRAMS = $(bin_PROGRAMS)
am_htsserver_OBJECTS = htsserver.$(OBJEXT) htsweb.$(OBJEXT)
htsserver_OBJECTS = $(am_htsserver_OBJECTS)
htsserver_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
- $(am__DEPENDENCIES_1) libhttrack.la
+ libhttrack.la
httrack_SOURCES = httrack.c
httrack_OBJECTS = httrack.$(OBJEXT)
-httrack_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
- libhttrack.la
+httrack_DEPENDENCIES = $(am__DEPENDENCIES_1) libhttrack.la
am_proxytrack_OBJECTS = proxytrack-main.$(OBJEXT) \
proxytrack-proxytrack.$(OBJEXT) proxytrack-store.$(OBJEXT) \
proxytrack-htsinthash.$(OBJEXT) proxytrack-htsmd5.$(OBJEXT) \
@@ -270,8 +269,8 @@ INCLUDES = \
-DDATADIR=\""$(datadir)"\" \
-DLIBDIR=\""$(libdir)"\"
-httrack_LDADD = $(THREADS_LIBS) $(OPENSSL_LIBS) libhttrack.la
-htsserver_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) $(OPENSSL_LIBS) libhttrack.la
+httrack_LDADD = $(THREADS_LIBS) libhttrack.la
+htsserver_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS) libhttrack.la
proxytrack_LDADD = $(THREADS_LIBS) $(SOCKET_LIBS)
proxytrack_CFLAGS = $(AM_CFLAGS) -DNO_MALLOCT
lib_LTLIBRARIES = libhttrack.la libhtsjava.la
@@ -289,7 +288,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
htsname.c htsrobots.c htstools.c htswizard.c \
htsalias.c htsthread.c htsindex.c htsbauth.c \
htsmd5.c htszlib.c htswrap.c \
- htsmodules.c htscharset.c punycode.c \
+ htsmodules.c htscharset.c punycode.c htsencoding.c \
md5.c \
htsmms.c \
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
@@ -302,7 +301,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
htsmodules.h htsname.h htsnet.h \
htsopt.h htsrobots.h htsthread.h \
htstools.h htswizard.h htswrap.h htszlib.h \
- htsstrings.h httrack-library.h htscharset.h punycode.h \
+ htsstrings.h httrack-library.h htscharset.h punycode.h htsencoding.h \
md5.h \
htsmms.h \
minizip/crypt.h minizip/ioapi.h minizip/mztools.h minizip/unzip.h minizip/zip.h \
@@ -311,7 +310,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
libhttrack_la_LIBADD = $(THREADS_LIBS) $(ZLIB_LIBS) $(OPENSSL_LIBS) $(DL_LIBS) $(SOCKET_LIBS) $(ICONV_LIBS)
libhttrack_la_LDFLAGS = -version-info $(VERSION_INFO)
libhtsjava_la_SOURCES = htsjava.c htsjava.h
-libhtsjava_la_LIBADD = $(THREADS_LIBS) $(OPENSSL_LIBS) $(DL_LIBS) libhttrack.la
+libhtsjava_la_LIBADD = $(THREADS_LIBS) $(DL_LIBS) libhttrack.la
libhtsjava_la_LDFLAGS = -version-info $(VERSION_INFO)
EXTRA_DIST = httrack.h webhttrack \
minizip/ChangeLogUnzip \
@@ -468,6 +467,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htscharset.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htscore.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htscoremain.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htsencoding.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htsfilters.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htsftp.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htshash.Plo@am__quote@
diff --git a/src/htscoremain.c b/src/htscoremain.c
index b918ed0..534c469 100644
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -2342,6 +2342,23 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) {
htsmain_free();
return 0;
break;
+ case '6': // entities: httrack -#6
+ if (++na < argc) {
+ char *const s = strdup(argv[na]);
+ if (s != NULL && hts_unescape_entities(s, s, strlen(s)) == 0) {
+ printf("%s\n", s);
+ free(s);
+ } else {
+ fprintf(stderr, "invalid string '%s'\n", argv[na]);
+ }
+ na += 1;
+ } else {
+ fprintf(stderr,
+ "Option #6 needs to be followed by a string");
+ }
+ htsmain_free();
+ return 0;
+ break;
case '!':
if (na + 1 >= argc) {
HTS_PANIC_PRINTF
diff --git a/src/htsencoding.c b/src/htsencoding.c
new file mode 100644
index 0000000..46c57a4
--- /dev/null
+++ b/src/htsencoding.c
@@ -0,0 +1,176 @@
+/* ------------------------------------------------------------ */
+/*
+HTTrack Website Copier, Offline Browser for Windows and Unix
+Copyright (C) Xavier Roche and other contributors
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 3
+of the License, or any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+Important notes:
+
+- We hereby ask people using this source NOT to use it in purpose of grabbing
+emails addresses, or collecting any other private information on persons.
+This would disgrace our work, and spoil the many hours we spent on it.
+
+Please visit our Website: http://www.httrack.com
+*/
+
+/* ------------------------------------------------------------ */
+/* File: Encoding conversion functions */
+/* Author: Xavier Roche */
+/* ------------------------------------------------------------ */
+
+#include "htscharset.h"
+#include "htsencoding.h"
+
+/* static int decode_entity(const unsigned int hash, const size_t len);
+*/
+#include "htsentities.h"
+
+/* Convert one hexadecimal digit to its numeric value (0..15); returns -1 if c is not a hex digit. */
+static int get_hex_value(char c) {
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ else if (c >= 'a' && c <= 'f') /* lowercase a..f map to 10..15 */
+ return (c - 'a' + 10);
+ else if (c >= 'A' && c <= 'F') /* uppercase A..F map to 10..15 */
+ return (c - 'A' + 10);
+ else
+ return -1; /* not a hexadecimal digit */
+}
+
+/* HASH_ADD(HASH, C) folds C into HASH: HASH = HASH * HASH_PRIME + HASH_CONST + C (Numerical Recipes LCG constants,
+ see <http://en.wikipedia.org/wiki/Linear_congruential_generator>). HASH is expanded three times: pass a plain lvalue. */
+#define HASH_PRIME ( 1664525 ) /* multiplier (a multiple of 5, so not a prime despite the name) */
+#define HASH_CONST ( 1013904223 ) /* increment */
+#define HASH_ADD(HASH, C) do { \
+ (HASH) *= HASH_PRIME; \
+ (HASH) += HASH_CONST; \
+ (HASH) += (C); \
+ } while(0)
+/* Decode HTML entities from src into dest as UTF-8 (src == dest is allowed). Returns 0, or -1 when more than max bytes would be needed (no NUL is written then). */
+int hts_unescape_entities(const char *src, char *dest, const size_t max) {
+ size_t i, j, ampStart, ampStartDest; /* i: read index in src, j: write index in dest; ampStart is (size_t) -1 when outside an entity */
+ int uc; /* code point being decoded; -1 while the entity is not (yet) numeric */
+ int hex; /* non-zero once the numeric entity is known to be hexadecimal */
+ unsigned int hash; /* running HASH_ADD hash of the entity name */
+ for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0,
+ uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) {
+ /* start of entity: remember where the ampersand sits in src and in dest, and reset the decoder state */
+ if (src[i] == '&') {
+ ampStart = i;
+ ampStartDest = j;
+ hash = 0;
+ uc = -1;
+ }
+ /* inside a potential entity (every byte is still copied verbatim below until the entity is resolved) */
+ else if (ampStart != (size_t) -1) {
+ /* &#..; entity: a hash sign right after the ampersand switches to numeric (decimal) mode */
+ if (ampStart + 1 == i && src[ampStart + 1] == '#') {
+ uc = 0;
+ hex = 0;
+ }
+ /* &#x..; entity: hexadecimal mode (NOTE(review): only a lowercase x is recognized here) */
+ else if (ampStart + 2 == i && src[ampStart + 1] == '#'
+ && src[ampStart + 2] == 'x') {
+ hex = 1;
+ }
+ /* end of entity */
+ else if (src[i] == ';') {
+ size_t len;
+
+ /* decode entity: a numeric one already has its value in uc, a named one is looked up now */
+ if (uc == -1) {
+ /* &foo; : lookup by name hash and name length (the name itself is not compared) */
+ uc = decode_entity(hash, /*&src[ampStart + 1],*/
+ i - ampStart - 1);
+ /* FIXME: TEMPORARY HACK FROM PREVIOUS VERSION TO BE INVESTIGATED: nbsp (160) is turned into a plain space (32) */
+ if (uc == 160) {
+ uc = 32;
+ }
+ }
+
+ /* end: we are no longer inside an entity, whatever the outcome */
+ ampStart = (size_t) -1;
+
+ /* success ? (otherwise we fall through and the ; is copied like any other byte) */
+ if (uc > 0) {
+ /* write at position: overwrite the verbatim copy, starting at the ampersand (hts_writeUTF8 is declared elsewhere; presumably returns the byte count, 0 on failure -- TODO confirm) */
+ len = hts_writeUTF8(uc, &dest[ampStartDest], max - ampStartDest);
+ if (len > 0) {
+ /* new dest position: rewind j to just after the encoded character */
+ j = ampStartDest + len;
+ }
+ /* do not copy ; (NOTE(review): this also holds when len == 0, the raw entity text then stays in dest without its ;) */
+ continue;
+ }
+ }
+ /* numerical entity: accumulate digits (NOTE(review): no range check, a long digit run can overflow the int) */
+ else if (uc != -1) {
+ /* decimal */
+ if (!hex) {
+ if (src[i] >= '0' && src[i] <= '9') {
+ const int h = src[i] - '0';
+ uc *= 10;
+ uc += h;
+ } else {
+ /* abandon: not a digit, the text copied so far stays as is */
+ ampStart = (size_t) -1;
+ }
+ }
+ /* hex */
+ else {
+ const int h = get_hex_value(src[i]);
+ if (h != -1) {
+ uc *= 16;
+ uc += h;
+ } else {
+ /* abandon: not a hex digit, the text copied so far stays as is */
+ ampStart = (size_t) -1;
+ }
+ }
+ }
+ /* alphanumerical entity */
+ else {
+ /* alphanum and at most 10 characters after the ampersand (per the original note, thetasym is the longest name) */
+ if (i <= ampStart + 10 &&
+ (
+ (src[i] >= '0' && src[i] <= '9')
+ || (src[i] >= 'A' && src[i] <= 'Z')
+ || (src[i] >= 'a' && src[i] <= 'z')
+ )
+ ) {
+ /* compute hash: fold this name character into the running hash */
+ HASH_ADD(hash, (unsigned char) src[i]);
+ } else {
+ /* abandon: not an entity name, the text copied so far stays as is */
+ ampStart = (size_t) -1;
+ }
+ }
+ }
+
+ /* copy the current byte verbatim (also reached for the ampersand and for entity bytes not yet resolved) */
+ if (j + 1 > max) {
+ /* overflow: dest may hold at most max bytes before the terminator */
+ return -1;
+ }
+ if (src != dest || i != j) { /* skip the store when decoding in-place and nothing has shifted yet */
+ dest[j] = src[i];
+ }
+ j++;
+ }
+ dest[j] = '\0'; /* terminator: lands on dest[max] at worst, so dest must hold max + 1 bytes */
+
+ return 0;
+}
diff --git a/src/htsencoding.h b/src/htsencoding.h
new file mode 100644
index 0000000..4dfd367
--- /dev/null
+++ b/src/htsencoding.h
@@ -0,0 +1,55 @@
+/* ------------------------------------------------------------ */
+/*
+HTTrack Website Copier, Offline Browser for Windows and Unix
+Copyright (C) Xavier Roche and other contributors
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 3
+of the License, or any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+Important notes:
+
+- We hereby ask people using this source NOT to use it in purpose of grabbing
+emails addresses, or collecting any other private information on persons.
+This would disgrace our work, and spoil the many hours we spent on it.
+
+Please visit our Website: http://www.httrack.com
+*/
+
+/* ------------------------------------------------------------ */
+/* File: Encoding conversion functions */
+/* Author: Xavier Roche */
+/* ------------------------------------------------------------ */
+
+#ifndef HTS_CHARSET_DEFH
+#define HTS_CHARSET_DEFH
+/* NOTE(review): this guard name looks copied from htscharset.h (which htsencoding.c includes first); if both headers share it, the second one is silently skipped -- confirm, HTS_ENCODING_DEFH would be safer */
+/** Standard includes. **/
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+/**
+ * Unescape HTML entities (as per HTML 4.0 Specification)
+ * and replace them by their UTF-8 equivalents, reading "src" and writing "dest".
+ * Note: source and destination may be the same, and the destination only
+ * needs to hold as much space as the source ("max" bytes, plus the final NUL).
+ * Returns 0 upon success, -1 if "max" bytes are not enough.
+ **/
+extern int hts_unescape_entities(const char *src,
+ char *dest, const size_t max);
+
+#endif
+
diff --git a/src/htsentities.h b/src/htsentities.h
new file mode 100644
index 0000000..bbb3a06
--- /dev/null
+++ b/src/htsentities.h
@@ -0,0 +1,1535 @@
+/*
+ -- htsentities.h --
+ FILE GENERATED BY ./htsentities.sh, DO NOT MODIFY
+
+ We compute the LCG hash
+ (see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
+ for each entity. We should in theory check using strncmp() that we
+ actually have the correct entity, but this is actually statistically
+ not needed.
+
+ We may want to do better, but we expect the hash function to be uniform, and
+ let the compiler be smart enough to optimize the switch (for example by
+ checking in log2() intervals)
+
+ This code has been generated using the evil ./htsentities.sh script.
+*/
+
+static int decode_entity(const unsigned int hash, const size_t len) {
+ switch(hash) {
+ /* nbsp 160 no-break space = non-breaking space, */
+ case 3948425267:
+ if (len == 4 /* && strncmp(ent, "nbsp") == 0 */) {
+ return 160;
+ }
+ break;
+ /* iexcl 161 inverted exclamation mark, U+00A1 ISOnum */
+ case 1499591408:
+ if (len == 5 /* && strncmp(ent, "iexcl") == 0 */) {
+ return 161;
+ }
+ break;
+ /* cent 162 cent sign, U+00A2 ISOnum */
+ case 2824786826:
+ if (len == 4 /* && strncmp(ent, "cent") == 0 */) {
+ return 162;
+ }
+ break;
+ /* pound 163 pound sign, U+00A3 ISOnum */
+ case 805305925:
+ if (len == 5 /* && strncmp(ent, "pound") == 0 */) {
+ return 163;
+ }
+ break;
+ /* curren 164 currency sign, U+00A4 ISOnum */
+ case 1584829677:
+ if (len == 6 /* && strncmp(ent, "curren") == 0 */) {
+ return 164;
+ }
+ break;
+ /* yen 165 yen sign = yuan sign, U+00A5 ISOnum */
+ case 3581281881:
+ if (len == 3 /* && strncmp(ent, "yen") == 0 */) {
+ return 165;
+ }
+ break;
+ /* brvbar 166 broken bar = broken vertical bar, */
+ case 3768851825:
+ if (len == 6 /* && strncmp(ent, "brvbar") == 0 */) {
+ return 166;
+ }
+ break;
+ /* sect 167 section sign, U+00A7 ISOnum */
+ case 2614630987:
+ if (len == 4 /* && strncmp(ent, "sect") == 0 */) {
+ return 167;
+ }
+ break;
+ /* uml 168 diaeresis = spacing diaeresis, */
+ case 2036319259:
+ if (len == 3 /* && strncmp(ent, "uml") == 0 */) {
+ return 168;
+ }
+ break;
+ /* copy 169 copyright sign, U+00A9 ISOnum */
+ case 2428845635:
+ if (len == 4 /* && strncmp(ent, "copy") == 0 */) {
+ return 169;
+ }
+ break;
+ /* ordf 170 feminine ordinal indicator, U+00AA ISOnum */
+ case 212470411:
+ if (len == 4 /* && strncmp(ent, "ordf") == 0 */) {
+ return 170;
+ }
+ break;
+ /* laquo 171 left-pointing double angle quotation mark */
+ case 401220509:
+ if (len == 5 /* && strncmp(ent, "laquo") == 0 */) {
+ return 171;
+ }
+ break;
+ /* not 172 not sign, U+00AC ISOnum */
+ case 3607627678:
+ if (len == 3 /* && strncmp(ent, "not") == 0 */) {
+ return 172;
+ }
+ break;
+ /* shy 173 soft hyphen = discretionary hyphen, */
+ case 1248857237:
+ if (len == 3 /* && strncmp(ent, "shy") == 0 */) {
+ return 173;
+ }
+ break;
+ /* reg 174 registered sign = registered trade mark sign, */
+ case 854293939:
+ if (len == 3 /* && strncmp(ent, "reg") == 0 */) {
+ return 174;
+ }
+ break;
+ /* macr 175 macron = spacing macron = overline */
+ case 591423527:
+ if (len == 4 /* && strncmp(ent, "macr") == 0 */) {
+ return 175;
+ }
+ break;
+ /* deg 176 degree sign, U+00B0 ISOnum */
+ case 3990252661:
+ if (len == 3 /* && strncmp(ent, "deg") == 0 */) {
+ return 176;
+ }
+ break;
+ /* plusmn 177 plus-minus sign = plus-or-minus sign, */
+ case 3641444957:
+ if (len == 6 /* && strncmp(ent, "plusmn") == 0 */) {
+ return 177;
+ }
+ break;
+ /* sup2 178 superscript two = superscript digit two */
+ case 279450434:
+ if (len == 4 /* && strncmp(ent, "sup2") == 0 */) {
+ return 178;
+ }
+ break;
+ /* sup3 179 superscript three = superscript digit three */
+ case 279450435:
+ if (len == 4 /* && strncmp(ent, "sup3") == 0 */) {
+ return 179;
+ }
+ break;
+ /* acute 180 acute accent = spacing acute, */
+ case 1795641881:
+ if (len == 5 /* && strncmp(ent, "acute") == 0 */) {
+ return 180;
+ }
+ break;
+ /* micro 181 micro sign, U+00B5 ISOnum */
+ case 1447763057:
+ if (len == 5 /* && strncmp(ent, "micro") == 0 */) {
+ return 181;
+ }
+ break;
+ /* para 182 pilcrow sign = paragraph sign, */
+ case 848855704:
+ if (len == 4 /* && strncmp(ent, "para") == 0 */) {
+ return 182;
+ }
+ break;
+ /* middot 183 middle dot = Georgian comma */
+ case 3167839463:
+ if (len == 6 /* && strncmp(ent, "middot") == 0 */) {
+ return 183;
+ }
+ break;
+ /* cedil 184 cedilla = spacing cedilla, U+00B8 ISOdia */
+ case 1354214564:
+ if (len == 5 /* && strncmp(ent, "cedil") == 0 */) {
+ return 184;
+ }
+ break;
+ /* sup1 185 superscript one = superscript digit one, */
+ case 279450433:
+ if (len == 4 /* && strncmp(ent, "sup1") == 0 */) {
+ return 185;
+ }
+ break;
+ /* ordm 186 masculine ordinal indicator, */
+ case 212470418:
+ if (len == 4 /* && strncmp(ent, "ordm") == 0 */) {
+ return 186;
+ }
+ break;
+ /* raquo 187 right-pointing double angle quotation mark */
+ case 1355124995:
+ if (len == 5 /* && strncmp(ent, "raquo") == 0 */) {
+ return 187;
+ }
+ break;
+ /* frac14 188 vulgar fraction one quarter */
+ case 1016175271:
+ if (len == 6 /* && strncmp(ent, "frac14") == 0 */) {
+ return 188;
+ }
+ break;
+ /* frac12 189 vulgar fraction one half */
+ case 1016175269:
+ if (len == 6 /* && strncmp(ent, "frac12") == 0 */) {
+ return 189;
+ }
+ break;
+ /* frac34 190 vulgar fraction three quarters */
+ case 1019504321:
+ if (len == 6 /* && strncmp(ent, "frac34") == 0 */) {
+ return 190;
+ }
+ break;
+ /* iquest 191 inverted question mark */
+ case 430057661:
+ if (len == 6 /* && strncmp(ent, "iquest") == 0 */) {
+ return 191;
+ }
+ break;
+ /* Agrave 192 latin capital letter A with grave */
+ case 2815520320:
+ if (len == 6 /* && strncmp(ent, "Agrave") == 0 */) {
+ return 192;
+ }
+ break;
+ /* Aacute 193 latin capital letter A with acute, */
+ case 4192391993:
+ if (len == 6 /* && strncmp(ent, "Aacute") == 0 */) {
+ return 193;
+ }
+ break;
+ /* Acirc 194 latin capital letter A with circumflex, */
+ case 619920369:
+ if (len == 5 /* && strncmp(ent, "Acirc") == 0 */) {
+ return 194;
+ }
+ break;
+ /* Atilde 195 latin capital letter A with tilde, */
+ case 4145258425:
+ if (len == 6 /* && strncmp(ent, "Atilde") == 0 */) {
+ return 195;
+ }
+ break;
+ /* Auml 196 latin capital letter A with diaeresis, */
+ case 3558330427:
+ if (len == 4 /* && strncmp(ent, "Auml") == 0 */) {
+ return 196;
+ }
+ break;
+ /* Aring 197 latin capital letter A with ring above */
+ case 1775583868:
+ if (len == 5 /* && strncmp(ent, "Aring") == 0 */) {
+ return 197;
+ }
+ break;
+ /* AElig 198 latin capital letter AE */
+ case 3743972869:
+ if (len == 5 /* && strncmp(ent, "AElig") == 0 */) {
+ return 198;
+ }
+ break;
+ /* Ccedil 199 latin capital letter C with cedilla, */
+ case 885931646:
+ if (len == 6 /* && strncmp(ent, "Ccedil") == 0 */) {
+ return 199;
+ }
+ break;
+ /* Egrave 200 latin capital letter E with grave, */
+ case 1380421556:
+ if (len == 6 /* && strncmp(ent, "Egrave") == 0 */) {
+ return 200;
+ }
+ break;
+ /* Eacute 201 latin capital letter E with acute, */
+ case 2757293229:
+ if (len == 6 /* && strncmp(ent, "Eacute") == 0 */) {
+ return 201;
+ }
+ break;
+ /* Ecirc 202 latin capital letter E with circumflex, */
+ case 1255856693:
+ if (len == 5 /* && strncmp(ent, "Ecirc") == 0 */) {
+ return 202;
+ }
+ break;
+ /* Euml 203 latin capital letter E with diaeresis, */
+ case 2436627087:
+ if (len == 4 /* && strncmp(ent, "Euml") == 0 */) {
+ return 203;
+ }
+ break;
+ /* Igrave 204 latin capital letter I with grave, */
+ case 4240290088:
+ if (len == 6 /* && strncmp(ent, "Igrave") == 0 */) {
+ return 204;
+ }
+ break;
+ /* Iacute 205 latin capital letter I with acute, */
+ case 1322194465:
+ if (len == 6 /* && strncmp(ent, "Iacute") == 0 */) {
+ return 205;
+ }
+ break;
+ /* Icirc 206 latin capital letter I with circumflex, */
+ case 1891793017:
+ if (len == 5 /* && strncmp(ent, "Icirc") == 0 */) {
+ return 206;
+ }
+ break;
+ /* Iuml 207 latin capital letter I with diaeresis, */
+ case 1314923747:
+ if (len == 4 /* && strncmp(ent, "Iuml") == 0 */) {
+ return 207;
+ }
+ break;
+ /* ETH 208 latin capital letter ETH, U+00D0 ISOlat1 */
+ case 475229442:
+ if (len == 3 /* && strncmp(ent, "ETH") == 0 */) {
+ return 208;
+ }
+ break;
+ /* Ntilde 209 latin capital letter N with tilde, */
+ case 2702412914:
+ if (len == 6 /* && strncmp(ent, "Ntilde") == 0 */) {
+ return 209;
+ }
+ break;
+ /* Ograve 210 latin capital letter O with grave, */
+ case 4235125590:
+ if (len == 6 /* && strncmp(ent, "Ograve") == 0 */) {
+ return 210;
+ }
+ break;
+ /* Oacute 211 latin capital letter O with acute, */
+ case 1317029967:
+ if (len == 6 /* && strncmp(ent, "Oacute") == 0 */) {
+ return 211;
+ }
+ break;
+ /* Ocirc 212 latin capital letter O with circumflex, */
+ case 2845697503:
+ if (len == 5 /* && strncmp(ent, "Ocirc") == 0 */) {
+ return 212;
+ }
+ break;
+ /* Otilde 213 latin capital letter O with tilde, */
+ case 1269896399:
+ if (len == 6 /* && strncmp(ent, "Otilde") == 0 */) {
+ return 213;
+ }
+ break;
+ /* Ouml 214 latin capital letter O with diaeresis, */
+ case 1779852385:
+ if (len == 4 /* && strncmp(ent, "Ouml") == 0 */) {
+ return 214;
+ }
+ break;
+ /* times 215 multiplication sign, U+00D7 ISOnum */
+ case 2139742557:
+ if (len == 5 /* && strncmp(ent, "times") == 0 */) {
+ return 215;
+ }
+ break;
+ /* Oslash 216 latin capital letter O with stroke */
+ case 1378045056:
+ if (len == 6 /* && strncmp(ent, "Oslash") == 0 */) {
+ return 216;
+ }
+ break;
+ /* Ugrave 217 latin capital letter U with grave, */
+ case 4229961092:
+ if (len == 6 /* && strncmp(ent, "Ugrave") == 0 */) {
+ return 217;
+ }
+ break;
+ /* Uacute 218 latin capital letter U with acute, */
+ case 1311865469:
+ if (len == 6 /* && strncmp(ent, "Uacute") == 0 */) {
+ return 218;
+ }
+ break;
+ /* Ucirc 219 latin capital letter U with circumflex, */
+ case 3799601989:
+ if (len == 5 /* && strncmp(ent, "Ucirc") == 0 */) {
+ return 219;
+ }
+ break;
+ /* Uuml 220 latin capital letter U with diaeresis, */
+ case 2244781023:
+ if (len == 4 /* && strncmp(ent, "Uuml") == 0 */) {
+ return 220;
+ }
+ break;
+ /* Yacute 221 latin capital letter Y with acute, */
+ case 4171734001:
+ if (len == 6 /* && strncmp(ent, "Yacute") == 0 */) {
+ return 221;
+ }
+ break;
+ /* THORN 222 latin capital letter THORN, */
+ case 4251263774:
+ if (len == 5 /* && strncmp(ent, "THORN") == 0 */) {
+ return 222;
+ }
+ break;
+ /* szlig 223 latin small letter sharp s = ess-zed, */
+ case 51833136:
+ if (len == 5 /* && strncmp(ent, "szlig") == 0 */) {
+ return 223;
+ }
+ break;
+ /* agrave 224 latin small letter a with grave */
+ case 4219632096:
+ if (len == 6 /* && strncmp(ent, "agrave") == 0 */) {
+ return 224;
+ }
+ break;
+ /* aacute 225 latin small letter a with acute, */
+ case 1301536473:
+ if (len == 6 /* && strncmp(ent, "aacute") == 0 */) {
+ return 225;
+ }
+ break;
+ /* acirc 226 latin small letter a with circumflex, */
+ case 1412443665:
+ if (len == 5 /* && strncmp(ent, "acirc") == 0 */) {
+ return 226;
+ }
+ break;
+ /* atilde 227 latin small letter a with tilde, */
+ case 1254402905:
+ if (len == 6 /* && strncmp(ent, "atilde") == 0 */) {
+ return 227;
+ }
+ break;
+ /* auml 228 latin small letter a with diaeresis, */
+ case 3174638299:
+ if (len == 4 /* && strncmp(ent, "auml") == 0 */) {
+ return 228;
+ }
+ break;
+ /* aring 229 latin small letter a with ring above */
+ case 2568107164:
+ if (len == 5 /* && strncmp(ent, "aring") == 0 */) {
+ return 229;
+ }
+ break;
+ /* aelig 230 latin small letter ae */
+ case 4152804037:
+ if (len == 5 /* && strncmp(ent, "aelig") == 0 */) {
+ return 230;
+ }
+ break;
+ /* ccedil 231 latin small letter c with cedilla, */
+ case 2290043422:
+ if (len == 6 /* && strncmp(ent, "ccedil") == 0 */) {
+ return 231;
+ }
+ break;
+ /* egrave 232 latin small letter e with grave, */
+ case 2784533332:
+ if (len == 6 /* && strncmp(ent, "egrave") == 0 */) {
+ return 232;
+ }
+ break;
+ /* eacute 233 latin small letter e with acute, */
+ case 4161405005:
+ if (len == 6 /* && strncmp(ent, "eacute") == 0 */) {
+ return 233;
+ }
+ break;
+ /* ecirc 234 latin small letter e with circumflex, */
+ case 2048379989:
+ if (len == 5 /* && strncmp(ent, "ecirc") == 0 */) {
+ return 234;
+ }
+ break;
+ /* euml 235 latin small letter e with diaeresis, */
+ case 2052934959:
+ if (len == 4 /* && strncmp(ent, "euml") == 0 */) {
+ return 235;
+ }
+ break;
+ /* igrave 236 latin small letter i with grave, */
+ case 1349434568:
+ if (len == 6 /* && strncmp(ent, "igrave") == 0 */) {
+ return 236;
+ }
+ break;
+ /* iacute 237 latin small letter i with acute, */
+ case 2726306241:
+ if (len == 6 /* && strncmp(ent, "iacute") == 0 */) {
+ return 237;
+ }
+ break;
+ /* icirc 238 latin small letter i with circumflex, */
+ case 2684316313:
+ if (len == 5 /* && strncmp(ent, "icirc") == 0 */) {
+ return 238;
+ }
+ break;
+ /* iuml 239 latin small letter i with diaeresis, */
+ case 931231619:
+ if (len == 4 /* && strncmp(ent, "iuml") == 0 */) {
+ return 239;
+ }
+ break;
+ /* eth 240 latin small letter eth, U+00F0 ISOlat1 */
+ case 109822946:
+ if (len == 3 /* && strncmp(ent, "eth") == 0 */) {
+ return 240;
+ }
+ break;
+ /* ntilde 241 latin small letter n with tilde, */
+ case 4106524690:
+ if (len == 6 /* && strncmp(ent, "ntilde") == 0 */) {
+ return 241;
+ }
+ break;
+ /* ograve 242 latin small letter o with grave, */
+ case 1344270070:
+ if (len == 6 /* && strncmp(ent, "ograve") == 0 */) {
+ return 242;
+ }
+ break;
+ /* oacute 243 latin small letter o with acute, */
+ case 2721141743:
+ if (len == 6 /* && strncmp(ent, "oacute") == 0 */) {
+ return 243;
+ }
+ break;
+ /* ocirc 244 latin small letter o with circumflex, */
+ case 3638220799:
+ if (len == 5 /* && strncmp(ent, "ocirc") == 0 */) {
+ return 244;
+ }
+ break;
+ /* otilde 245 latin small letter o with tilde, */
+ case 2674008175:
+ if (len == 6 /* && strncmp(ent, "otilde") == 0 */) {
+ return 245;
+ }
+ break;
+ /* ouml 246 latin small letter o with diaeresis, */
+ case 1396160257:
+ if (len == 4 /* && strncmp(ent, "ouml") == 0 */) {
+ return 246;
+ }
+ break;
+ /* divide 247 division sign, U+00F7 ISOnum */
+ case 2204943563:
+ if (len == 6 /* && strncmp(ent, "divide") == 0 */) {
+ return 247;
+ }
+ break;
+ /* oslash 248 latin small letter o with stroke, */
+ case 2782156832:
+ if (len == 6 /* && strncmp(ent, "oslash") == 0 */) {
+ return 248;
+ }
+ break;
+ /* ugrave 249 latin small letter u with grave, */
+ case 1339105572:
+ if (len == 6 /* && strncmp(ent, "ugrave") == 0 */) {
+ return 249;
+ }
+ break;
+ /* uacute 250 latin small letter u with acute, */
+ case 2715977245:
+ if (len == 6 /* && strncmp(ent, "uacute") == 0 */) {
+ return 250;
+ }
+ break;
+ /* ucirc 251 latin small letter u with circumflex, */
+ case 297157989:
+ if (len == 5 /* && strncmp(ent, "ucirc") == 0 */) {
+ return 251;
+ }
+ break;
+ /* uuml 252 latin small letter u with diaeresis, */
+ case 1861088895:
+ if (len == 4 /* && strncmp(ent, "uuml") == 0 */) {
+ return 252;
+ }
+ break;
+ /* yacute 253 latin small letter y with acute, */
+ case 1280878481:
+ if (len == 6 /* && strncmp(ent, "yacute") == 0 */) {
+ return 253;
+ }
+ break;
+ /* thorn 254 latin small letter thorn with, */
+ case 4294688446:
+ if (len == 5 /* && strncmp(ent, "thorn") == 0 */) {
+ return 254;
+ }
+ break;
+ /* yuml 255 latin small letter y with diaeresis, */
+ case 739385555:
+ if (len == 4 /* && strncmp(ent, "yuml") == 0 */) {
+ return 255;
+ }
+ break;
+ /* fnof 402 latin small f with hook = function */
+ case 2270075705:
+ if (len == 4 /* && strncmp(ent, "fnof") == 0 */) {
+ return 402;
+ }
+ break;
+ /* Alpha 913 greek capital letter alpha, U+0391 */
+ case 4027656009:
+ if (len == 5 /* && strncmp(ent, "Alpha") == 0 */) {
+ return 913;
+ }
+ break;
+ /* Beta 914 greek capital letter beta, U+0392 */
+ case 277666448:
+ if (len == 4 /* && strncmp(ent, "Beta") == 0 */) {
+ return 914;
+ }
+ break;
+ /* Gamma 915 greek capital letter gamma, */
+ case 1537149070:
+ if (len == 5 /* && strncmp(ent, "Gamma") == 0 */) {
+ return 915;
+ }
+ break;
+ /* Delta 916 greek capital letter delta, */
+ case 3855542753:
+ if (len == 5 /* && strncmp(ent, "Delta") == 0 */) {
+ return 916;
+ }
+ break;
+ /* Epsilon 917 greek capital letter epsilon, U+0395 */
+ case 2449300823:
+ if (len == 7 /* && strncmp(ent, "Epsilon") == 0 */) {
+ return 917;
+ }
+ break;
+ /* Zeta 918 greek capital letter zeta, U+0396 */
+ case 2137381000:
+ if (len == 4 /* && strncmp(ent, "Zeta") == 0 */) {
+ return 918;
+ }
+ break;
+ /* Eta 919 greek capital letter eta, U+0397 */
+ case 528494267:
+ if (len == 3 /* && strncmp(ent, "Eta") == 0 */) {
+ return 919;
+ }
+ break;
+ /* Theta 920 greek capital letter theta, */
+ case 3904764433:
+ if (len == 5 /* && strncmp(ent, "Theta") == 0 */) {
+ return 920;
+ }
+ break;
+ /* Iota 921 greek capital letter iota, U+0399 */
+ case 3284124477:
+ if (len == 4 /* && strncmp(ent, "Iota") == 0 */) {
+ return 921;
+ }
+ break;
+ /* Kappa 922 greek capital letter kappa, U+039A */
+ case 3346788084:
+ if (len == 5 /* && strncmp(ent, "Kappa") == 0 */) {
+ return 922;
+ }
+ break;
+ /* Lambda 923 greek capital letter lambda, */
+ case 1824315307:
+ if (len == 6 /* && strncmp(ent, "Lambda") == 0 */) {
+ return 923;
+ }
+ break;
+ /* Mu 924 greek capital letter mu, U+039C */
+ case 1324604304:
+ if (len == 2 /* && strncmp(ent, "Mu") == 0 */) {
+ return 924;
+ }
+ break;
+ /* Nu 925 greek capital letter nu, U+039D */
+ case 1326268829:
+ if (len == 2 /* && strncmp(ent, "Nu") == 0 */) {
+ return 925;
+ }
+ break;
+ /* Xi 926 greek capital letter xi, U+039E ISOgrk3 */
+ case 1342914067:
+ if (len == 2 /* && strncmp(ent, "Xi") == 0 */) {
+ return 926;
+ }
+ break;
+ /* Omicron 927 greek capital letter omicron, U+039F */
+ case 488730696:
+ if (len == 7 /* && strncmp(ent, "Omicron") == 0 */) {
+ return 927;
+ }
+ break;
+ /* Pi 928 greek capital letter pi, U+03A0 ISOgrk3 */
+ case 1329597867:
+ if (len == 2 /* && strncmp(ent, "Pi") == 0 */) {
+ return 928;
+ }
+ break;
+ /* Rho 929 greek capital letter rho, U+03A1 */
+ case 1277958850:
+ if (len == 3 /* && strncmp(ent, "Rho") == 0 */) {
+ return 929;
+ }
+ break;
+ /* Sigma 931 greek capital letter sigma, */
+ case 3159100428:
+ if (len == 5 /* && strncmp(ent, "Sigma") == 0 */) {
+ return 931;
+ }
+ break;
+ /* Tau 932 greek capital letter tau, U+03A4 */
+ case 2045446591:
+ if (len == 3 /* && strncmp(ent, "Tau") == 0 */) {
+ return 932;
+ }
+ break;
+ /* Upsilon 933 greek capital letter upsilon, */
+ case 2291992807:
+ if (len == 7 /* && strncmp(ent, "Upsilon") == 0 */) {
+ return 933;
+ }
+ break;
+ /* Phi 934 greek capital letter phi, */
+ case 498819434:
+ if (len == 3 /* && strncmp(ent, "Phi") == 0 */) {
+ return 934;
+ }
+ break;
+ /* Chi 935 greek capital letter chi, U+03A7 */
+ case 4024347861:
+ if (len == 3 /* && strncmp(ent, "Chi") == 0 */) {
+ return 935;
+ }
+ break;
+ /* Psi 936 greek capital letter psi, */
+ case 517129209:
+ if (len == 3 /* && strncmp(ent, "Psi") == 0 */) {
+ return 936;
+ }
+ break;
+ /* Omega 937 greek capital letter omega, */
+ case 612334204:
+ if (len == 5 /* && strncmp(ent, "Omega") == 0 */) {
+ return 937;
+ }
+ break;
+ /* alpha 945 greek small letter alpha, */
+ case 525212009:
+ if (len == 5 /* && strncmp(ent, "alpha") == 0 */) {
+ return 945;
+ }
+ break;
+ /* beta 946 greek small letter beta, U+03B2 ISOgrk3 */
+ case 4188941616:
+ if (len == 4 /* && strncmp(ent, "beta") == 0 */) {
+ return 946;
+ }
+ break;
+ /* gamma 947 greek small letter gamma, */
+ case 2329672366:
+ if (len == 5 /* && strncmp(ent, "gamma") == 0 */) {
+ return 947;
+ }
+ break;
+ /* delta 948 greek small letter delta, */
+ case 353098753:
+ if (len == 5 /* && strncmp(ent, "delta") == 0 */) {
+ return 948;
+ }
+ break;
+ /* epsilon 949 greek small letter epsilon, */
+ case 2134684791:
+ if (len == 7 /* && strncmp(ent, "epsilon") == 0 */) {
+ return 949;
+ }
+ break;
+ /* zeta 950 greek small letter zeta, U+03B6 ISOgrk3 */
+ case 1753688872:
+ if (len == 4 /* && strncmp(ent, "zeta") == 0 */) {
+ return 950;
+ }
+ break;
+ /* eta 951 greek small letter eta, U+03B7 ISOgrk3 */
+ case 109822939:
+ if (len == 3 /* && strncmp(ent, "eta") == 0 */) {
+ return 951;
+ }
+ break;
+ /* theta 952 greek small letter theta, */
+ case 402320433:
+ if (len == 5 /* && strncmp(ent, "theta") == 0 */) {
+ return 952;
+ }
+ break;
+ /* iota 953 greek small letter iota, U+03B9 ISOgrk3 */
+ case 2900432349:
+ if (len == 4 /* && strncmp(ent, "iota") == 0 */) {
+ return 953;
+ }
+ break;
+ /* kappa 954 greek small letter kappa, */
+ case 4139311380:
+ if (len == 5 /* && strncmp(ent, "kappa") == 0 */) {
+ return 954;
+ }
+ break;
+ /* lambda 955 greek small letter lambda, */
+ case 3228427083:
+ if (len == 6 /* && strncmp(ent, "lambda") == 0 */) {
+ return 955;
+ }
+ break;
+ /* mu 956 greek small letter mu, U+03BC ISOgrk3 */
+ case 1377869104:
+ if (len == 2 /* && strncmp(ent, "mu") == 0 */) {
+ return 956;
+ }
+ break;
+ /* nu 957 greek small letter nu, U+03BD ISOgrk3 */
+ case 1379533629:
+ if (len == 2 /* && strncmp(ent, "nu") == 0 */) {
+ return 957;
+ }
+ break;
+ /* xi 958 greek small letter xi, U+03BE ISOgrk3 */
+ case 1396178867:
+ if (len == 2 /* && strncmp(ent, "xi") == 0 */) {
+ return 958;
+ }
+ break;
+ /* omicron 959 greek small letter omicron, U+03BF NEW */
+ case 174114664:
+ if (len == 7 /* && strncmp(ent, "omicron") == 0 */) {
+ return 959;
+ }
+ break;
+ /* pi 960 greek small letter pi, U+03C0 ISOgrk3 */
+ case 1382862667:
+ if (len == 2 /* && strncmp(ent, "pi") == 0 */) {
+ return 960;
+ }
+ break;
+ /* rho 961 greek small letter rho, U+03C1 ISOgrk3 */
+ case 859287522:
+ if (len == 3 /* && strncmp(ent, "rho") == 0 */) {
+ return 961;
+ }
+ break;
+ /* sigmaf 962 greek small letter final sigma, */
+ case 2582995969:
+ if (len == 6 /* && strncmp(ent, "sigmaf") == 0 */) {
+ return 962;
+ }
+ break;
+ /* sigma 963 greek small letter sigma, */
+ case 3951623724:
+ if (len == 5 /* && strncmp(ent, "sigma") == 0 */) {
+ return 963;
+ }
+ break;
+ /* tau 964 greek small letter tau, U+03C4 ISOgrk3 */
+ case 1626775263:
+ if (len == 3 /* && strncmp(ent, "tau") == 0 */) {
+ return 964;
+ }
+ break;
+ /* upsilon 965 greek small letter upsilon, */
+ case 1977376775:
+ if (len == 7 /* && strncmp(ent, "upsilon") == 0 */) {
+ return 965;
+ }
+ break;
+ /* phi 966 greek small letter phi, U+03C6 ISOgrk3 */
+ case 80148106:
+ if (len == 3 /* && strncmp(ent, "phi") == 0 */) {
+ return 966;
+ }
+ break;
+ /* chi 967 greek small letter chi, U+03C7 ISOgrk3 */
+ case 3605676533:
+ if (len == 3 /* && strncmp(ent, "chi") == 0 */) {
+ return 967;
+ }
+ break;
+ /* psi 968 greek small letter psi, U+03C8 ISOgrk3 */
+ case 98457881:
+ if (len == 3 /* && strncmp(ent, "psi") == 0 */) {
+ return 968;
+ }
+ break;
+ /* omega 969 greek small letter omega, */
+ case 1404857500:
+ if (len == 5 /* && strncmp(ent, "omega") == 0 */) {
+ return 969;
+ }
+ break;
+ /* thetasym 977 greek small letter theta symbol, */
+ case 3881711083:
+ if (len == 8 /* && strncmp(ent, "thetasym") == 0 */) {
+ return 977;
+ }
+ break;
+ /* upsih 978 greek upsilon with hook symbol, */
+ case 3753563936:
+ if (len == 5 /* && strncmp(ent, "upsih") == 0 */) {
+ return 978;
+ }
+ break;
+ /* piv 982 greek pi symbol, U+03D6 ISOgrk3 */
+ case 81812644:
+ if (len == 3 /* && strncmp(ent, "piv") == 0 */) {
+ return 982;
+ }
+ break;
+ /* bull 8226 bullet = black small circle, */
+ case 1818806115:
+ if (len == 4 /* && strncmp(ent, "bull") == 0 */) {
+ return 8226;
+ }
+ break;
+ /* hellip 8230 horizontal ellipsis = three dot leader, */
+ case 1967714928:
+ if (len == 6 /* && strncmp(ent, "hellip") == 0 */) {
+ return 8230;
+ }
+ break;
+ /* prime 8242 prime = minutes = feet, U+2032 ISOtech */
+ case 656236556:
+ if (len == 5 /* && strncmp(ent, "prime") == 0 */) {
+ return 8242;
+ }
+ break;
+ /* Prime 8243 double prime = seconds = inches, */
+ case 4158680556:
+ if (len == 5 /* && strncmp(ent, "Prime") == 0 */) {
+ return 8243;
+ }
+ break;
+ /* oline 8254 overline = spacing overscore, */
+ case 33988362:
+ if (len == 5 /* && strncmp(ent, "oline") == 0 */) {
+ return 8254;
+ }
+ break;
+ /* frasl 8260 fraction slash, U+2044 NEW */
+ case 254792559:
+ if (len == 5 /* && strncmp(ent, "frasl") == 0 */) {
+ return 8260;
+ }
+ break;
+ /* weierp 8472 script capital P = power set */
+ case 3305299450:
+ if (len == 6 /* && strncmp(ent, "weierp") == 0 */) {
+ return 8472;
+ }
+ break;
+ /* image 8465 blackletter capital I = imaginary part, */
+ case 3187641494:
+ if (len == 5 /* && strncmp(ent, "image") == 0 */) {
+ return 8465;
+ }
+ break;
+ /* real 8476 blackletter capital R = real part symbol, */
+ case 3965469588:
+ if (len == 4 /* && strncmp(ent, "real") == 0 */) {
+ return 8476;
+ }
+ break;
+ /* trade 8482 trade mark sign, U+2122 ISOnum */
+ case 2455601811:
+ if (len == 5 /* && strncmp(ent, "trade") == 0 */) {
+ return 8482;
+ }
+ break;
+ /* alefsym 8501 alef symbol = first transfinite cardinal, */
+ case 3894502290:
+ if (len == 7 /* && strncmp(ent, "alefsym") == 0 */) {
+ return 8501;
+ }
+ break;
+ /* larr 8592 leftwards arrow, U+2190 ISOnum */
+ case 1970559061:
+ if (len == 4 /* && strncmp(ent, "larr") == 0 */) {
+ return 8592;
+ }
+ break;
+ /* uarr 8593 upwards arrow, U+2191 ISOnum */
+ case 2667952018:
+ if (len == 4 /* && strncmp(ent, "uarr") == 0 */) {
+ return 8593;
+ }
+ break;
+ /* rarr 8594 rightwards arrow, U+2192 ISOnum */
+ case 2435487699:
+ if (len == 4 /* && strncmp(ent, "rarr") == 0 */) {
+ return 8594;
+ }
+ break;
+ /* darr 8595 downwards arrow, U+2193 ISOnum */
+ case 4213965741:
+ if (len == 4 /* && strncmp(ent, "darr") == 0 */) {
+ return 8595;
+ }
+ break;
+ /* harr 8596 left right arrow, U+2194 ISOamsa */
+ case 3092262401:
+ if (len == 4 /* && strncmp(ent, "harr") == 0 */) {
+ return 8596;
+ }
+ break;
+ /* crarr 8629 downwards arrow with corner leftwards */
+ case 4071143093:
+ if (len == 5 /* && strncmp(ent, "crarr") == 0 */) {
+ return 8629;
+ }
+ break;
+ /* lArr 8656 leftwards double arrow, U+21D0 ISOtech */
+ case 2389230389:
+ if (len == 4 /* && strncmp(ent, "lArr") == 0 */) {
+ return 8656;
+ }
+ break;
+ /* uArr 8657 upwards double arrow, U+21D1 ISOamsa */
+ case 3086623346:
+ if (len == 4 /* && strncmp(ent, "uArr") == 0 */) {
+ return 8657;
+ }
+ break;
+ /* rArr 8658 rightwards double arrow, */
+ case 2854159027:
+ if (len == 4 /* && strncmp(ent, "rArr") == 0 */) {
+ return 8658;
+ }
+ break;
+ /* dArr 8659 downwards double arrow, U+21D3 ISOamsa */
+ case 337669773:
+ if (len == 4 /* && strncmp(ent, "dArr") == 0 */) {
+ return 8659;
+ }
+ break;
+ /* hArr 8660 left right double arrow, */
+ case 3510933729:
+ if (len == 4 /* && strncmp(ent, "hArr") == 0 */) {
+ return 8660;
+ }
+ break;
+ /* forall 8704 for all, U+2200 ISOtech */
+ case 2607244222:
+ if (len == 6 /* && strncmp(ent, "forall") == 0 */) {
+ return 8704;
+ }
+ break;
+ /* part 8706 partial differential, U+2202 ISOtech */
+ case 848855723:
+ if (len == 4 /* && strncmp(ent, "part") == 0 */) {
+ return 8706;
+ }
+ break;
+ /* exist 8707 there exists, U+2203 ISOtech */
+ case 3677294764:
+ if (len == 5 /* && strncmp(ent, "exist") == 0 */) {
+ return 8707;
+ }
+ break;
+ /* empty 8709 empty set = null set = diameter, */
+ case 4121922294:
+ if (len == 5 /* && strncmp(ent, "empty") == 0 */) {
+ return 8709;
+ }
+ break;
+ /* nabla 8711 nabla = backward difference, */
+ case 3450596949:
+ if (len == 5 /* && strncmp(ent, "nabla") == 0 */) {
+ return 8711;
+ }
+ break;
+ /* isin 8712 element of, U+2208 ISOtech */
+ case 145434111:
+ if (len == 4 /* && strncmp(ent, "isin") == 0 */) {
+ return 8712;
+ }
+ break;
+ /* notin 8713 not an element of, U+2209 ISOtech */
+ case 89445443:
+ if (len == 5 /* && strncmp(ent, "notin") == 0 */) {
+ return 8713;
+ }
+ break;
+ /* ni 8715 contains as member, U+220B ISOtech */
+ case 1379533617:
+ if (len == 2 /* && strncmp(ent, "ni") == 0 */) {
+ return 8715;
+ }
+ break;
+ /* prod 8719 n-ary product = product sign, */
+ case 3171579821:
+ if (len == 4 /* && strncmp(ent, "prod") == 0 */) {
+ return 8719;
+ }
+ break;
+ /* sum 8721 n-ary sumation, U+2211 ISOamsb */
+ case 1270496050:
+ if (len == 3 /* && strncmp(ent, "sum") == 0 */) {
+ return 8721;
+ }
+ break;
+ /* minus 8722 minus sign, U+2212 ISOtech */
+ case 1443056095:
+ if (len == 5 /* && strncmp(ent, "minus") == 0 */) {
+ return 8722;
+ }
+ break;
+ /* lowast 8727 asterisk operator, U+2217 ISOtech */
+ case 137860408:
+ if (len == 6 /* && strncmp(ent, "lowast") == 0 */) {
+ return 8727;
+ }
+ break;
+ /* radic 8730 square root = radical sign, */
+ case 565711814:
+ if (len == 5 /* && strncmp(ent, "radic") == 0 */) {
+ return 8730;
+ }
+ break;
+ /* prop 8733 proportional to, U+221D ISOtech */
+ case 3171579833:
+ if (len == 4 /* && strncmp(ent, "prop") == 0 */) {
+ return 8733;
+ }
+ break;
+ /* infin 8734 infinity, U+221E ISOtech */
+ case 3784651419:
+ if (len == 5 /* && strncmp(ent, "infin") == 0 */) {
+ return 8734;
+ }
+ break;
+ /* ang 8736 angle, U+2220 ISOamso */
+ case 2836524271:
+ if (len == 3 /* && strncmp(ent, "ang") == 0 */) {
+ return 8736;
+ }
+ break;
+ /* and 8743 logical and = wedge, U+2227 ISOtech */
+ case 2836524268:
+ if (len == 3 /* && strncmp(ent, "and") == 0 */) {
+ return 8743;
+ }
+ break;
+ /* or 8744 logical or = vee, U+2228 ISOtech */
+ case 1381198151:
+ if (len == 2 /* && strncmp(ent, "or") == 0 */) {
+ return 8744;
+ }
+ break;
+ /* cap 8745 intersection = cap, U+2229 ISOtech */
+ case 3594024865:
+ if (len == 3 /* && strncmp(ent, "cap") == 0 */) {
+ return 8745;
+ }
+ break;
+ /* cup 8746 union = cup, U+222A ISOtech */
+ case 3627315365:
+ if (len == 3 /* && strncmp(ent, "cup") == 0 */) {
+ return 8746;
+ }
+ break;
+ /* int 8747 integral, U+222B ISOtech */
+ case 1658114628:
+ if (len == 3 /* && strncmp(ent, "int") == 0 */) {
+ return 8747;
+ }
+ break;
+ /* there4 8756 therefore, U+2234 ISOtech */
+ case 1359369970:
+ if (len == 6 /* && strncmp(ent, "there4") == 0 */) {
+ return 8756;
+ }
+ break;
+ /* sim 8764 tilde operator = varies with = similar to, */
+ case 1250521750:
+ if (len == 3 /* && strncmp(ent, "sim") == 0 */) {
+ return 8764;
+ }
+ break;
+ /* cong 8773 approximately equal to, U+2245 ISOtech */
+ case 2425516567:
+ if (len == 4 /* && strncmp(ent, "cong") == 0 */) {
+ return 8773;
+ }
+ break;
+ /* asymp 8776 almost equal to = asymptotic to, */
+ case 3150422973:
+ if (len == 5 /* && strncmp(ent, "asymp") == 0 */) {
+ return 8776;
+ }
+ break;
+ /* ne 8800 not equal to, U+2260 ISOtech */
+ case 1379533613:
+ if (len == 2 /* && strncmp(ent, "ne") == 0 */) {
+ return 8800;
+ }
+ break;
+ /* equiv 8801 identical to, U+2261 ISOtech */
+ case 634790405:
+ if (len == 5 /* && strncmp(ent, "equiv") == 0 */) {
+ return 8801;
+ }
+ break;
+ /* le 8804 less-than or equal to, U+2264 ISOtech */
+ case 1376204563:
+ if (len == 2 /* && strncmp(ent, "le") == 0 */) {
+ return 8804;
+ }
+ break;
+ /* ge 8805 greater-than or equal to, */
+ case 1367881938:
+ if (len == 2 /* && strncmp(ent, "ge") == 0 */) {
+ return 8805;
+ }
+ break;
+ /* sub 8834 subset of, U+2282 ISOtech */
+ case 1270496039:
+ if (len == 3 /* && strncmp(ent, "sub") == 0 */) {
+ return 8834;
+ }
+ break;
+ /* sup 8835 superset of, U+2283 ISOtech */
+ case 1270496053:
+ if (len == 3 /* && strncmp(ent, "sup") == 0 */) {
+ return 8835;
+ }
+ break;
+ /* nsub 8836 not a subset of, U+2284 ISOamsn */
+ case 1984504696:
+ if (len == 4 /* && strncmp(ent, "nsub") == 0 */) {
+ return 8836;
+ }
+ break;
+ /* sube 8838 subset of or equal to, U+2286 ISOtech */
+ case 256147135:
+ if (len == 4 /* && strncmp(ent, "sube") == 0 */) {
+ return 8838;
+ }
+ break;
+ /* supe 8839 superset of or equal to, */
+ case 279450485:
+ if (len == 4 /* && strncmp(ent, "supe") == 0 */) {
+ return 8839;
+ }
+ break;
+ /* oplus 8853 circled plus = direct sum, */
+ case 92645826:
+ if (len == 5 /* && strncmp(ent, "oplus") == 0 */) {
+ return 8853;
+ }
+ break;
+ /* otimes 8855 circled times = vector product, */
+ case 3065242419:
+ if (len == 6 /* && strncmp(ent, "otimes") == 0 */) {
+ return 8855;
+ }
+ break;
+ /* perp 8869 up tack = orthogonal to = perpendicular, */
+ case 2407134539:
+ if (len == 4 /* && strncmp(ent, "perp") == 0 */) {
+ return 8869;
+ }
+ break;
+ /* sdot 8901 dot operator, U+22C5 ISOamsb */
+ case 2245035582:
+ if (len == 4 /* && strncmp(ent, "sdot") == 0 */) {
+ return 8901;
+ }
+ break;
+ /* lceil 8968 left ceiling = apl upstile, */
+ case 1588009020:
+ if (len == 5 /* && strncmp(ent, "lceil") == 0 */) {
+ return 8968;
+ }
+ break;
+ /* rceil 8969 right ceiling, U+2309 ISOamsc */
+ case 2541913506:
+ if (len == 5 /* && strncmp(ent, "rceil") == 0 */) {
+ return 8969;
+ }
+ break;
+ /* lfloor 8970 left floor = apl downstile, */
+ case 1870296512:
+ if (len == 6 /* && strncmp(ent, "lfloor") == 0 */) {
+ return 8970;
+ }
+ break;
+ /* rfloor 8971 right floor, U+230B ISOamsc */
+ case 1865132014:
+ if (len == 6 /* && strncmp(ent, "rfloor") == 0 */) {
+ return 8971;
+ }
+ break;
+ /* lang 9001 left-pointing angle bracket = bra, */
+ case 1963900950:
+ if (len == 4 /* && strncmp(ent, "lang") == 0 */) {
+ return 9001;
+ }
+ break;
+ /* rang 9002 right-pointing angle bracket = ket, */
+ case 2428829588:
+ if (len == 4 /* && strncmp(ent, "rang") == 0 */) {
+ return 9002;
+ }
+ break;
+ /* loz 9674 lozenge, U+25CA ISOpub */
+ case 2828488274:
+ if (len == 3 /* && strncmp(ent, "loz") == 0 */) {
+ return 9674;
+ }
+ break;
+ /* spades 9824 black spade suit, U+2660 ISOpub */
+ case 4026453962:
+ if (len == 6 /* && strncmp(ent, "spades") == 0 */) {
+ return 9824;
+ }
+ break;
+ /* clubs 9827 black club suit = shamrock, */
+ case 2781041564:
+ if (len == 5 /* && strncmp(ent, "clubs") == 0 */) {
+ return 9827;
+ }
+ break;
+ /* hearts 9829 black heart suit = valentine, */
+ case 2039418001:
+ if (len == 6 /* && strncmp(ent, "hearts") == 0 */) {
+ return 9829;
+ }
+ break;
+ /* diams 9830 black diamond suit, U+2666 ISOpub */
+ case 3524411593:
+ if (len == 5 /* && strncmp(ent, "diams") == 0 */) {
+ return 9830;
+ }
+ break;
+ /* quot 34 quotation mark = APL quote, */
+ case 2986121293:
+ if (len == 4 /* && strncmp(ent, "quot") == 0 */) {
+ return 34;
+ }
+ break;
+ /* amp 38 ampersand, U+0026 ISOnum */
+ case 2834859755:
+ if (len == 3 /* && strncmp(ent, "amp") == 0 */) {
+ return 38;
+ }
+ break;
+ /* lt 60 less-than sign, U+003C ISOnum */
+ case 1376204578:
+ if (len == 2 /* && strncmp(ent, "lt") == 0 */) {
+ return 60;
+ }
+ break;
+ /* gt 62 greater-than sign, U+003E ISOnum */
+ case 1367881953:
+ if (len == 2 /* && strncmp(ent, "gt") == 0 */) {
+ return 62;
+ }
+ break;
+ /* OElig 338 latin capital ligature OE, */
+ case 1674782707:
+ if (len == 5 /* && strncmp(ent, "OElig") == 0 */) {
+ return 338;
+ }
+ break;
+ /* oelig 339 latin small ligature oe, U+0153 ISOlat2 */
+ case 2083613875:
+ if (len == 5 /* && strncmp(ent, "oelig") == 0 */) {
+ return 339;
+ }
+ break;
+ /* Scaron 352 latin capital letter S with caron, */
+ case 1731202952:
+ if (len == 6 /* && strncmp(ent, "Scaron") == 0 */) {
+ return 352;
+ }
+ break;
+ /* scaron 353 latin small letter s with caron, */
+ case 3135314728:
+ if (len == 6 /* && strncmp(ent, "scaron") == 0 */) {
+ return 353;
+ }
+ break;
+ /* Yuml 376 latin capital letter Y with diaeresis, */
+ case 1123077683:
+ if (len == 4 /* && strncmp(ent, "Yuml") == 0 */) {
+ return 376;
+ }
+ break;
+ /* circ 710 modifier letter circumflex accent, */
+ case 94756433:
+ if (len == 4 /* && strncmp(ent, "circ") == 0 */) {
+ return 710;
+ }
+ break;
+ /* tilde 732 small tilde, U+02DC ISOdia */
+ case 1748508313:
+ if (len == 5 /* && strncmp(ent, "tilde") == 0 */) {
+ return 732;
+ }
+ break;
+ /* ensp 8194 en space, U+2002 ISOpub */
+ case 3630901474:
+ if (len == 4 /* && strncmp(ent, "ensp") == 0 */) {
+ return 8194;
+ }
+ break;
+ /* emsp 8195 em space, U+2003 ISOpub */
+ case 3241331769:
+ if (len == 4 /* && strncmp(ent, "emsp") == 0 */) {
+ return 8195;
+ }
+ break;
+ /* thinsp 8201 thin space, U+2009 ISOpub */
+ case 2997658516:
+ if (len == 6 /* && strncmp(ent, "thinsp") == 0 */) {
+ return 8201;
+ }
+ break;
+ /* zwnj 8204 zero width non-joiner, */
+ case 166021829:
+ if (len == 4 /* && strncmp(ent, "zwnj") == 0 */) {
+ return 8204;
+ }
+ break;
+ /* zwj 8205 zero width joiner, U+200D NEW RFC 2070 */
+ case 4000813032:
+ if (len == 3 /* && strncmp(ent, "zwj") == 0 */) {
+ return 8205;
+ }
+ break;
+ /* lrm 8206 left-to-right mark, U+200E NEW RFC 2070 */
+ case 2833481836:
+ if (len == 3 /* && strncmp(ent, "lrm") == 0 */) {
+ return 8206;
+ }
+ break;
+ /* rlm 8207 right-to-left mark, U+200F NEW RFC 2070 */
+ case 865945620:
+ if (len == 3 /* && strncmp(ent, "rlm") == 0 */) {
+ return 8207;
+ }
+ break;
+ /* ndash 8211 en dash, U+2013 ISOpub */
+ case 3305143245:
+ if (len == 5 /* && strncmp(ent, "ndash") == 0 */) {
+ return 8211;
+ }
+ break;
+ /* mdash 8212 em dash, U+2014 ISOpub */
+ case 3146159164:
+ if (len == 5 /* && strncmp(ent, "mdash") == 0 */) {
+ return 8212;
+ }
+ break;
+ /* lsquo 8216 left single quotation mark, */
+ case 1796006423:
+ if (len == 5 /* && strncmp(ent, "lsquo") == 0 */) {
+ return 8216;
+ }
+ break;
+ /* rsquo 8217 right single quotation mark, */
+ case 2749910909:
+ if (len == 5 /* && strncmp(ent, "rsquo") == 0 */) {
+ return 8217;
+ }
+ break;
+ /* sbquo 8218 single low-9 quotation mark, U+201A NEW */
+ case 159941417:
+ if (len == 5 /* && strncmp(ent, "sbquo") == 0 */) {
+ return 8218;
+ }
+ break;
+ /* ldquo 8220 left double quotation mark, */
+ case 633684828:
+ if (len == 5 /* && strncmp(ent, "ldquo") == 0 */) {
+ return 8220;
+ }
+ break;
+ /* rdquo 8221 right double quotation mark, */
+ case 1587589314:
+ if (len == 5 /* && strncmp(ent, "rdquo") == 0 */) {
+ return 8221;
+ }
+ break;
+ /* bdquo 8222 double low-9 quotation mark, U+201E NEW */
+ case 3338811314:
+ if (len == 5 /* && strncmp(ent, "bdquo") == 0 */) {
+ return 8222;
+ }
+ break;
+ /* dagger 8224 dagger, U+2020 ISOpub */
+ case 3288241744:
+ if (len == 6 /* && strncmp(ent, "dagger") == 0 */) {
+ return 8224;
+ }
+ break;
+ /* Dagger 8225 double dagger, U+2021 ISOpub */
+ case 1884129968:
+ if (len == 6 /* && strncmp(ent, "Dagger") == 0 */) {
+ return 8225;
+ }
+ break;
+ /* permil 8240 per mille sign, U+2030 ISOtech */
+ case 4246983035:
+ if (len == 6 /* && strncmp(ent, "permil") == 0 */) {
+ return 8240;
+ }
+ break;
+ /* lsaquo 8249 single left-pointing angle quotation mark, */
+ case 2442191187:
+ if (len == 6 /* && strncmp(ent, "lsaquo") == 0 */) {
+ return 8249;
+ }
+ break;
+ /* rsaquo 8250 single right-pointing angle quotation mark, */
+ case 2437026689:
+ if (len == 6 /* && strncmp(ent, "rsaquo") == 0 */) {
+ return 8250;
+ }
+ break;
+ /* euro 8364 euro sign, U+20AC NEW */
+ case 2061257587:
+ if (len == 4 /* && strncmp(ent, "euro") == 0 */) {
+ return 8364;
+ }
+ break;
+ }
+ /* unknown */
+ return -1;
+}
diff --git a/src/htsentities.sh b/src/htsentities.sh
new file mode 100755
index 0000000..5039aee
--- /dev/null
+++ b/src/htsentities.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Generates htsentities.h: a C decode_entity() switch keyed on the LCG hash of each HTML 4.0 entity name.
+
+src=html40.txt  # local copy of the HTML 4.0 specification text, used when present
+url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt  # downloaded when ${src} does not exist
+dest=htsentities.h  # output file; overwritten by the redirection at the end of this script
+
+(
+ cat <<EOF
+/*
+ -- ${dest} --
+ FILE GENERATED BY $0, DO NOT MODIFY
+
+ We compute the LCG hash
+ (see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
+ for each entity. We should in theory check using strncmp() that we
+ actually have the correct entity, but this is actually statistically
+ not needed.
+
+ We may want to do better, but we expect the hash function to be uniform, and
+ let the compiler be smart enough to optimize the switch (for example by
+ checking in log2() intervals)
+
+ This code has been generated using the evil $0 script.
+*/
+
+static int decode_entity(const unsigned int hash, const size_t len) {
+ switch(hash) {
+EOF
+ (
+ if test -f ${src}; then  # prefer the local copy of the specification
+ cat ${src}
+ else
+ GET "${url}"  # NOTE(review): presumably the lwp-request "GET" tool -- confirm it is installed
+ fi
+ ) \
+ | grep -E '^<!ENTITY [a-zA-Z0-9_]' \
+ | sed \
+ -e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
+ -e 's/-->$//' \
+ -e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/'\
+| ( \
+ read A  # sed has rewritten each matching line as "<name> <code> <description>"
+ while test -n "$A"; do  # the loop ends as soon as read yields an empty line
+ ent="${A%% *}"  # entity name: everything before the first space
+ code=$(echo "$A"|cut -f2 -d' ')  # numeric character code: second space-separated field
+ # compute the LCG hash of the entity name, one character at a time
+ hash=0
+ i=0
+ a=1664525  # LCG multiplier
+ c=1013904223  # LCG increment
+ m="$[1 << 32]"  # modulus 2^32, keeps the hash within 32 bits
+ while test "$i" -lt ${#ent}; do
+ d="$(echo -n "${ent:${i}:1}"|hexdump -v -e '/1 "%d"')"  # decimal byte value of character i of the name
+ hash="$[((${hash}*${a})%(${m})+${d}+${c})%(${m})]"  # hash = (hash * a + d + c) mod m
+ i=$[${i}+1]
+ done
+ echo -e " /* $A */"
+ echo -e " case ${hash}:"
+ echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {"
+ echo -e " return ${code};"
+ echo -e " }"
+ echo -e " break;"
+
+ # read the next entity line
+ read A
+ done
+ )
+ cat <<EOF
+ }
+ /* unknown */
+ return -1;
+}
+EOF
+) > ${dest}
diff --git a/src/htslib.c b/src/htslib.c
index fe9f240..8b53b88 100644
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -54,6 +54,7 @@ Please visit our Website: http://www.httrack.com
#include "htsmd5.h"
#include "htsmodules.h"
#include "htscharset.h"
+#include "htsencoding.h"
#ifdef _WIN32
#ifndef _WIN32_WCE
@@ -3737,255 +3738,6 @@ void code64(unsigned char *a, int size_a, unsigned char *b, int crlf) {
*b++ = '\0';
}
-// remplacer &quot; par " etc..
-// buffer MAX 1Ko
-#define strcmpbeg(a, b) strncmp(a, b, strlen(b))
-HTSEXT_API void unescape_amp(char *s) {
- while(*s) {
- if (*s == '&') {
- char *end = strchr(s, ';');
-
- if (end && (((int) (end - s)) <= 8)) {
- unsigned char c = 0;
-
- // http://www.w3.org/TR/xhtml-modularization/dtd_module_defs.html
- if (strcmpbeg(s, "&#") == 0) {
- int num = 0;
-
- if ((s[2] == 'x') || (s[2] == 'X')) {
- if (sscanf(s + 3, "%x", &num) == 1 && num <= 0xff) {
- c = (unsigned char) num;
- }
- } else {
- if (sscanf(s + 2, "%d", &num) == 1 && num <= 0xff) {
- c = (unsigned char) num;
- }
- }
- } else if (strcmpbeg(s, "&nbsp;") == 0)
- c = 32; // hack - c=160;
- else if (strcmpbeg(s, "&iexcl;") == 0)
- c = 161;
- else if (strcmpbeg(s, "&cent;") == 0)
- c = 162;
- else if (strcmpbeg(s, "&pound;") == 0)
- c = 163;
- else if (strcmpbeg(s, "&curren;") == 0)
- c = 164;
- else if (strcmpbeg(s, "&yen;") == 0)
- c = 165;
- else if (strcmpbeg(s, "&brvbar;") == 0)
- c = 166;
- else if (strcmpbeg(s, "&sect;") == 0)
- c = 167;
- else if (strcmpbeg(s, "&uml;") == 0)
- c = 168;
- else if (strcmpbeg(s, "&copy;") == 0)
- c = 169;
- else if (strcmpbeg(s, "&ordf;") == 0)
- c = 170;
- //else if (strcmpbeg(s, "&laquo;")==0)
- // c=171;
- else if (strcmpbeg(s, "&not;") == 0)
- c = 172;
- //else if (strcmpbeg(s, "&shy;")==0)
- // c=173;
- else if (strcmpbeg(s, "&reg;") == 0)
- c = 174;
- else if (strcmpbeg(s, "&macr;") == 0)
- c = 175;
- else if (strcmpbeg(s, "&deg;") == 0)
- c = 176;
- else if (strcmpbeg(s, "&plusmn;") == 0)
- c = 177;
- else if (strcmpbeg(s, "&sup2;") == 0)
- c = 178;
- else if (strcmpbeg(s, "&sup3;") == 0)
- c = 179;
- else if (strcmpbeg(s, "&acute;") == 0)
- c = 180;
- else if (strcmpbeg(s, "&micro;") == 0)
- c = 181;
- else if (strcmpbeg(s, "&para;") == 0)
- c = 182;
- else if (strcmpbeg(s, "&middot;") == 0)
- c = 183;
- else if (strcmpbeg(s, "&cedil;") == 0)
- c = 184;
- else if (strcmpbeg(s, "&sup1;") == 0)
- c = 185;
- else if (strcmpbeg(s, "&ordm;") == 0)
- c = 186;
- //else if (strcmpbeg(s, "&raquo;")==0)
- // c=187;
- else if (strcmpbeg(s, "&frac14;") == 0)
- c = 188;
- else if (strcmpbeg(s, "&frac12;") == 0)
- c = 189;
- else if (strcmpbeg(s, "&frac34;") == 0)
- c = 190;
- else if (strcmpbeg(s, "&iquest;") == 0)
- c = 191;
- else if (strcmpbeg(s, "&Agrave;") == 0)
- c = 192;
- else if (strcmpbeg(s, "&Aacute;") == 0)
- c = 193;
- else if (strcmpbeg(s, "&Acirc;") == 0)
- c = 194;
- else if (strcmpbeg(s, "&Atilde;") == 0)
- c = 195;
- else if (strcmpbeg(s, "&Auml;") == 0)
- c = 196;
- else if (strcmpbeg(s, "&Aring;") == 0)
- c = 197;
- else if (strcmpbeg(s, "&AElig;") == 0)
- c = 198;
- else if (strcmpbeg(s, "&Ccedil;") == 0)
- c = 199;
- else if (strcmpbeg(s, "&Egrave;") == 0)
- c = 200;
- else if (strcmpbeg(s, "&Eacute;") == 0)
- c = 201;
- else if (strcmpbeg(s, "&Ecirc;") == 0)
- c = 202;
- else if (strcmpbeg(s, "&Euml;") == 0)
- c = 203;
- else if (strcmpbeg(s, "&Igrave;") == 0)
- c = 204;
- else if (strcmpbeg(s, "&Iacute;") == 0)
- c = 205;
- else if (strcmpbeg(s, "&Icirc;") == 0)
- c = 206;
- else if (strcmpbeg(s, "&Iuml;") == 0)
- c = 207;
- else if (strcmpbeg(s, "&ETH;") == 0)
- c = 208;
- else if (strcmpbeg(s, "&Ntilde;") == 0)
- c = 209;
- else if (strcmpbeg(s, "&Ograve;") == 0)
- c = 210;
- else if (strcmpbeg(s, "&Oacute;") == 0)
- c = 211;
- else if (strcmpbeg(s, "&Ocirc;") == 0)
- c = 212;
- else if (strcmpbeg(s, "&Otilde;") == 0)
- c = 213;
- else if (strcmpbeg(s, "&Ouml;") == 0)
- c = 214;
- else if (strcmpbeg(s, "&times;") == 0)
- c = 215;
- else if (strcmpbeg(s, "&Oslash;") == 0)
- c = 216;
- else if (strcmpbeg(s, "&Ugrave;") == 0)
- c = 217;
- else if (strcmpbeg(s, "&Uacute;") == 0)
- c = 218;
- else if (strcmpbeg(s, "&Ucirc;") == 0)
- c = 219;
- else if (strcmpbeg(s, "&Uuml;") == 0)
- c = 220;
- else if (strcmpbeg(s, "&Yacute;") == 0)
- c = 221;
- else if (strcmpbeg(s, "&THORN;") == 0)
- c = 222;
- else if (strcmpbeg(s, "&szlig;") == 0)
- c = 223;
- else if (strcmpbeg(s, "&agrave;") == 0)
- c = 224;
- else if (strcmpbeg(s, "&aacute;") == 0)
- c = 225;
- else if (strcmpbeg(s, "&acirc;") == 0)
- c = 226;
- else if (strcmpbeg(s, "&atilde;") == 0)
- c = 227;
- else if (strcmpbeg(s, "&auml;") == 0)
- c = 228;
- else if (strcmpbeg(s, "&aring;") == 0)
- c = 229;
- else if (strcmpbeg(s, "&aelig;") == 0)
- c = 230;
- else if (strcmpbeg(s, "&ccedil;") == 0)
- c = 231;
- else if (strcmpbeg(s, "&egrave;") == 0)
- c = 232;
- else if (strcmpbeg(s, "&eacute;") == 0)
- c = 233;
- else if (strcmpbeg(s, "&ecirc;") == 0)
- c = 234;
- else if (strcmpbeg(s, "&euml;") == 0)
- c = 235;
- else if (strcmpbeg(s, "&igrave;") == 0)
- c = 236;
- else if (strcmpbeg(s, "&iacute;") == 0)
- c = 237;
- else if (strcmpbeg(s, "&icirc;") == 0)
- c = 238;
- else if (strcmpbeg(s, "&iuml;") == 0)
- c = 239;
- else if (strcmpbeg(s, "&eth;") == 0)
- c = 240;
- else if (strcmpbeg(s, "&ntilde;") == 0)
- c = 241;
- else if (strcmpbeg(s, "&ograve;") == 0)
- c = 242;
- else if (strcmpbeg(s, "&oacute;") == 0)
- c = 243;
- else if (strcmpbeg(s, "&ocirc;") == 0)
- c = 244;
- else if (strcmpbeg(s, "&otilde;") == 0)
- c = 245;
- else if (strcmpbeg(s, "&ouml;") == 0)
- c = 246;
- else if (strcmpbeg(s, "&divide;") == 0)
- c = 247;
- else if (strcmpbeg(s, "&oslash;") == 0)
- c = 248;
- else if (strcmpbeg(s, "&ugrave;") == 0)
- c = 249;
- else if (strcmpbeg(s, "&uacute;") == 0)
- c = 250;
- else if (strcmpbeg(s, "&ucirc;") == 0)
- c = 251;
- else if (strcmpbeg(s, "&uuml;") == 0)
- c = 252;
- else if (strcmpbeg(s, "&yacute;") == 0)
- c = 253;
- else if (strcmpbeg(s, "&thorn;") == 0)
- c = 254;
- else if (strcmpbeg(s, "&yuml;") == 0)
- c = 255;
- //
- else if (strcmpbeg(s, "&amp;") == 0)
- c = '&';
- else if (strcmpbeg(s, "&gt;") == 0)
- c = '>';
- else if (strcmpbeg(s, "&laquo;") == 0)
- c = '\"';
- else if (strcmpbeg(s, "&lt;") == 0)
- c = '<';
- else if (strcmpbeg(s, "&nbsp;") == 0)
- c = ' ';
- else if (strcmpbeg(s, "&quot;") == 0)
- c = '\"';
- else if (strcmpbeg(s, "&raquo;") == 0)
- c = '\"';
- else if (strcmpbeg(s, "&shy;") == 0)
- c = '-';
- else if (strcmpbeg(s, "&tilde;") == 0)
- c = '~';
- // remplacer?
- if (c) {
- char BIGSTK buff[HTS_URLMAXSIZE * 2];
-
- buff[0] = (char) c;
- strcpybuff(buff + 1, end + 1);
- strcpybuff(s, buff);
- }
- }
- }
- s++;
- }
-}
-
static int ehexh(char c) {
if ((c >= '0') && (c <= '9'))
return c - '0';
@@ -4000,6 +3752,12 @@ static int ehex(const char *s) {
return 16 * ehexh(*s) + ehexh(*(s + 1));
}
+/*
+ * Decode HTML entities (for example "&amp;") in the NUL-terminated string s,
+ * in place.
+ *
+ * This is a thin wrapper around hts_unescape_entities(): s is passed as both
+ * source and destination, together with the size of its current contents
+ * including the terminating NUL.
+ * NOTE(review): the third argument is presumably the destination capacity;
+ * confirm against htsencoding.h.
+ *
+ * A non-zero result from hts_unescape_entities() is reported through
+ * assertf().
+ */
+void unescape_amp(char *s) {
+  const size_t size = strlen(s) + 1;
+
+  if (hts_unescape_entities(s, s, size) != 0) {
+    /* The message names the operation that failed: un-escaping. */
+    assertf(! "error unescaping html entities");
+  }
+}
+
// remplacer %20 par ' ', | par : etc..
// buffer MAX 1Ko
HTSEXT_API char *unescape_http(char *catbuff, const char *s) {
diff --git a/src/htsparse.c b/src/htsparse.c
index 419d882..52445b3 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -2092,11 +2092,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
*a = '\0';
} else
query[0] = '\0';
- // conversion &amp; -> & et autres joyeusetés
- unescape_amp(lien);
- unescape_amp(query);
// décoder l'inutile (%2E par exemple) et coder espaces
- // Unescape high-chars foir UTF-8 conversion
+ // Unescape high-chars for UTF-8 conversion
strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset)); /* note: '%' is still escaped */
escape_remove_control(lien);
// ???? No! escape_spc_url(lien);
@@ -2115,6 +2112,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
free(s);
}
}
+ // decode HTML entities (now done after charset decoding)
+ unescape_amp(lien);
+ unescape_amp(query);
}
// convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance!