summary refs log tree commit diff
path: root/src/htsentities.sh
diff options
context:
space:
mode:
author Xavier Roche <xroche@users.noreply.github.com> 2013-05-30 19:04:51 +0000
committer Xavier Roche <xroche@users.noreply.github.com> 2013-05-30 19:04:51 +0000
commit 850f165f4ac90a6e6687c392ddfdd0c6a05b3fe5 (patch)
tree 7cac2f01468639c4ab63fe523c17d7638e8cd2ac /src/htsentities.sh
parent 01af2a5e73f53ebf8a092e4bda77cd1326c1da11 (diff)
Added hts_unescape_entities(), a rewrite of the HTML entities decoder.
Fixed HTML entities decoding which was done before charset decoding.
Diffstat (limited to 'src/htsentities.sh')
-rwxr-xr-x src/htsentities.sh 75
1 files changed, 75 insertions, 0 deletions
diff --git a/src/htsentities.sh b/src/htsentities.sh
new file mode 100755
index 0000000..5039aee
--- /dev/null
+++ b/src/htsentities.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#
+
+src=html40.txt
+url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt
+dest=htsentities.h
+
+(
+ cat <<EOF
+/*
+ -- ${dest} --
+ FILE GENERATED BY $0, DO NOT MODIFY
+
+ We compute the LCG hash
+ (see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
+ for each entity. We should in theory check using strncmp() that we
+ actually have the correct entity, but this is actually statistically
+ not needed.
+
+ We may want to do better, but we expect the hash function to be uniform, and
+ let the compiler be smart enough to optimize the switch (for example by
+ checking in log2() intervals)
+
+ This code has been generated using the evil $0 script.
+*/
+
+static int decode_entity(const unsigned int hash, const size_t len) {
+ switch(hash) {
+EOF
+ (
+ if test -f ${src}; then
+ cat ${src}
+ else
+ GET "${url}"
+ fi
+ ) \
+ | grep -E '^<!ENTITY [a-zA-Z0-9_]' \
+ | sed \
+ -e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
+ -e 's/-->$//' \
+ -e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/'\
+| ( \
+ read A
+ while test -n "$A"; do
+ ent="${A%% *}"
+ code=$(echo "$A"|cut -f2 -d' ')
+ # compute hash
+ hash=0
+ i=0
+ a=1664525
+ c=1013904223
+ m="$[1 << 32]"
+ while test "$i" -lt ${#ent}; do
+ d="$(echo -n "${ent:${i}:1}"|hexdump -v -e '/1 "%d"')"
+ hash="$[((${hash}*${a})%(${m})+${d}+${c})%(${m})]"
+ i=$[${i}+1]
+ done
+ echo -e " /* $A */"
+ echo -e " case ${hash}:"
+ echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {"
+ echo -e " return ${code};"
+ echo -e " }"
+ echo -e " break;"
+
+ # next
+ read A
+ done
+ )
+ cat <<EOF
+ }
+ /* unknown */
+ return -1;
+}
+EOF
+) > ${dest}