diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-05-30 19:04:51 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-05-30 19:04:51 +0000 |
commit | 850f165f4ac90a6e6687c392ddfdd0c6a05b3fe5 (patch) | |
tree | 7cac2f01468639c4ab63fe523c17d7638e8cd2ac /src/htsentities.sh | |
parent | 01af2a5e73f53ebf8a092e4bda77cd1326c1da11 (diff) |
Added hts_unescape_entities(), a rewrite of the HTML entities decoder.
Fixed HTML entities decoding which was done before charset decoding.
Diffstat (limited to 'src/htsentities.sh')
-rwxr-xr-x | src/htsentities.sh | 75 |
1 files changed, 75 insertions, 0 deletions
#!/bin/bash
#
# src/htsentities.sh (mode 100755) -- regenerate htsentities.h.
#
# Reads the "<!ENTITY name CDATA "&#code;" -- description -->" declarations of
# the HTML 4.0 specification, either from the local file ${src} when it exists
# or from ${url} through the GET command (NOTE(review): presumably lwp-request
# from libwww-perl -- confirm), and writes a C function
#
#   static int decode_entity(const unsigned int hash, const size_t len)
#
# made of one switch case per entity, keyed on the LCG hash of the entity name
# and guarded by the name length. The function returns the entity code, or -1
# when the hash/length pair is unknown.
#
# The header is first produced in a temporary file and only moved over ${dest}
# once every pipeline stage succeeded, so a failed download or an empty match
# never clobbers a previously generated header.

set -eu
set -o pipefail

src=html40.txt
url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt
dest=htsentities.h
tmp="${dest}.tmp.$$"

# Drop the partial output on any exit path; harmless after the final mv.
trap 'rm -f "${tmp}"' EXIT

# Linear congruential generator parameters (multiplier, increment, modulus).
# NOTE(review): the C code hashing entity names before calling
# decode_entity() is expected to use the same constants -- confirm.
lcg_a=1664525
lcg_c=1013904223
lcg_m=$((1 << 32))

# Print the specification text on stdout (local copy preferred).
emit_spec() {
  if test -f "${src}"; then
    cat "${src}"
  else
    GET "${url}"
  fi
}

# Keep the single-line entity declarations and reduce each of them to
# "name code description" with runs of whitespace squeezed to one space.
extract_entities() {
  grep -E '^<!ENTITY [a-zA-Z0-9_]' \
    | sed \
      -e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
      -e 's/-->$//' \
      -e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/'
}

# Print the LCG hash of the string $1:
#   hash = ((hash * a) mod m + char + c) mod m, for each character in order.
entity_hash() {
  local name="$1" hash=0 i=0 d
  while test "${i}" -lt "${#name}"; do
    # A leading quote makes printf yield the numeric value of the character
    # that follows it, which avoids one hexdump pipeline per character.
    printf -v d '%d' "'${name:${i}:1}"
    hash=$(( ((hash * lcg_a) % lcg_m + d + lcg_c) % lcg_m ))
    i=$((i + 1))
  done
  printf '%s\n' "${hash}"
}

# Turn each "name code description" line read on stdin into a switch case.
emit_cases() {
  local line ent rest code hash
  # "|| test -n" still processes a final line lacking its trailing newline.
  while read -r line || test -n "${line}"; do
    ent="${line%% *}"
    # Second space-separated field (same result as: cut -f2 -d' ').
    rest="${line#* }"
    code="${rest%% *}"
    hash=$(entity_hash "${ent}")
    # printf '%s' emits the text verbatim; no backslash escape is interpreted
    # inside the entity description.
    printf '%s\n' \
      " /* ${line} */" \
      " case ${hash}:" \
      " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {" \
      " return ${code};" \
      " }" \
      " break;"
  done
}

{
  cat <<EOF
/*
 -- ${dest} --
 FILE GENERATED BY $0, DO NOT MODIFY

 We compute the LCG hash
 (see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
 for each entity. We should in theory check using strncmp() that we
 actually have the correct entity, but this is actually statistically
 not needed.

 We may want to do better, but we expect the hash function to be uniform, and
 let the compiler be smart enough to optimize the switch (for example by
 checking in log2() intervals)

 This code has been generated using the evil $0 script.
*/

static int decode_entity(const unsigned int hash, const size_t len) {
 switch(hash) {
EOF
  # With pipefail, a failure of any stage (download error, no entity found)
  # aborts the script before the header is installed.
  emit_spec | extract_entities | emit_cases
  cat <<EOF
 }
 /* unknown */
 return -1;
}
EOF
} > "${tmp}"

mv -f -- "${tmp}" "${dest}"