#!/bin/bash
#
# Generate a C header (${dest}) containing decode_entity(), a switch-based
# lookup from the hash of an HTML entity name to its numeric character code.
#
# Input:  the plain-text HTML 4.0 specification, read from ${src} when that
#         file exists in the current directory, otherwise fetched from ${url}
#         with the GET command-line tool.
# Output: ${dest}, overwritten only when at least one entity was extracted.
# Exit:   0 on success, 1 when the specification could not be read or no
#         entity declaration was found in it.
#
# NOTE(review): the copy of this script under review had lost both of its
# here-documents and part of the grep/sed pipeline (text between '<' and '>'
# had been stripped and the sed groups mangled). The header preamble, the
# footer, the grep pattern and the indentation of the emitted C code were
# reconstructed from the surviving fragments -- confirm them against a
# previously generated htsentities.h before committing regenerated output.

src=html40.txt
url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt
dest=htsentities.h

#######################################
# Compute the linear congruential hash of an entity name.
# For every character: hash = ((hash * a) % m + code + c) % m, with m = 2^32.
# Intermediate products stay below 2^53, so they fit in the shell's 64-bit
# arithmetic.
# Arguments: $1 - entity name (expected to be plain ASCII)
# Outputs:   the hash as an unsigned decimal number on stdout
#######################################
entity_hash() {
  local name=$1
  local -r a=1664525 c=1013904223 m=$((1 << 32))
  local hash=0 i d

  for ((i = 0; i < ${#name}; i++)); do
    # A leading quote makes printf yield the numeric code of the character;
    # this replaces one "echo | hexdump" pipeline per character.
    printf -v d '%d' "'${name:i:1}"
    hash=$(( ((hash * a) % m + d + c) % m ))
  done
  printf '%s\n' "${hash}"
}

#######################################
# Write the HTML 4.0 specification text to stdout.
# Globals:  src, url (read)
# Returns:  non-zero when neither the local copy nor the GET tool is usable
#######################################
fetch_spec() {
  if [[ -f "${src}" ]]; then
    cat -- "${src}"
  elif command -v GET >/dev/null 2>&1; then
    GET "${url}"
  else
    printf '%s: %s not found and the GET command is not available\n' \
      "${0##*/}" "${src}" >&2
    return 1
  fi
}

#######################################
# Filter the specification (stdin) down to one line per character entity,
# formatted as "name code description".
# NOTE(review): the grep pattern is a reconstruction; it keeps only
# declarations of the form
#   <!ENTITY name CDATA "&#NNN;" -- description
# which is the shape the surviving sed expression expects.
#######################################
list_entities() {
  grep -E '^<!ENTITY [A-Za-z0-9_]+[[:space:]]+CDATA[[:space:]]+"&#[0-9]+;"[[:space:]]+--[[:space:]]' \
    | sed \
      -e 's/<!ENTITY //' \
      -e 's/[[:space:]][[:space:]]*/ /g' \
      -e 's/-->$//' \
      -e 's/\([^ ]*\) CDATA "&#\([^"]*\);" -- \(.*\)/\1 \2 \3/'
}

#######################################
# Emit the opening of the generated header: explanatory comment plus the
# start of decode_entity(). The here-document is unquoted on purpose so that
# $0 expands to the name of this script.
#######################################
emit_header() {
  cat <<EOF
/*
    htsentities.h -- FILE AUTOMATICALLY GENERATED, DO NOT MODIFY.

    We compute the LCG hash
    (see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
    for each entity. We should in theory check using strncmp() that we
    actually have the correct entity, but this is actually statistically
    not needed.

    We may want to do better, but we expect the hash function to be uniform,
    and let the compiler be smart enough to optimize the switch (for example
    by checking in log2() intervals)

    This code has been generated using the evil $0 script.
*/

static int decode_entity(const unsigned int hash, const size_t len) {
  switch(hash) {
EOF
}

#######################################
# Read "name code description" lines on stdin and emit one C "case" per
# entity, keyed by the hash of the name and guarded by the name length.
#######################################
emit_cases() {
  local line ent code hash

  while IFS= read -r line; do
    [[ -n "${line}" ]] || continue
    # First field is the entity name, second its numeric code; the rest of
    # the line is only echoed back inside the C comment.
    read -r ent code _ <<<"${line}"
    hash=$(entity_hash "${ent}")
    printf '    /* %s */\n' "${line}"
    printf '    case %su:\n' "${hash}"
    printf '      if (len == %d /* && strncmp(ent, "%s") == 0 */) {\n' \
      "${#ent}" "${ent}"
    printf '        return %s;\n' "${code}"
    printf '      }\n'
    printf '      break;\n'
  done
}

#######################################
# Emit the end of the switch and of decode_entity().
# NOTE(review): the original footer was lost; -1 as the "unknown entity"
# result is an assumption -- confirm against the callers of decode_entity().
#######################################
emit_footer() {
  cat <<EOF
  }
  /* unknown */
  return -1;
}
EOF
}

#######################################
# Build ${dest}. The entity list is collected before ${dest} is opened so
# that a missing or unusable specification never clobbers an existing header
# with an empty switch.
# Globals:  dest (read)
# Returns:  0 on success, 1 on failure
#######################################
main() {
  local entities

  entities=$(fetch_spec | list_entities)
  if [[ -z "${entities}" ]]; then
    printf '%s: no entity declaration found, %s left untouched\n' \
      "${0##*/}" "${dest}" >&2
    return 1
  fi

  {
    emit_header
    emit_cases <<<"${entities}"
    emit_footer
  } >"${dest}"
}

main "$@"