From 844ecc37072d515513177c65a8c9dc35c9cdfc1a Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Mon, 19 Mar 2012 12:55:42 +0000 Subject: httrack 3.33.16 --- libtest/callbacks-example-contentfilter.c | 120 ++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100755 libtest/callbacks-example-contentfilter.c (limited to 'libtest/callbacks-example-contentfilter.c') diff --git a/libtest/callbacks-example-contentfilter.c b/libtest/callbacks-example-contentfilter.c new file mode 100755 index 0000000..54ee9c0 --- /dev/null +++ b/libtest/callbacks-example-contentfilter.c @@ -0,0 +1,120 @@ +/* + HTTrack external callbacks example : crawling html pages depending on content + Example of _init and _exit call (httrack >> 3.31) + .c file + + How to use: + - compile this file as a module (callback.so or callback.dll) + example: + (with gcc) + gcc -O -g3 -Wall -D_REENTRANT -DINET6 -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -shared -o callback.so callbacks-example-contentfilter.c + or (with visual c++) + cl -LD -nologo -W3 -Zi -Zp4 -DWIN32 -Fe"callback.dll" callbacks-example-contentfilter.c + - use the --wrapper option in httrack: + httrack --wrapper save-name=callback:process,string[,string..] +*/ + +#include +#include +#include + +/* "External" */ +#ifdef _WIN32 +#define EXTERNAL_FUNCTION __declspec(dllexport) +#else +#define EXTERNAL_FUNCTION +#endif + +/* Function definitions */ +EXTERNAL_FUNCTION int process(char* html, int len, char* address, char* filename); +EXTERNAL_FUNCTION int wrapper_init(char* module, char* initString); +EXTERNAL_FUNCTION int wrapper_exit(void); + +/* TOLOWER */ +#define TOLOWER_(a) (a >= 'A' && a <= 'Z') ? (a + ('a' - 'A')) : a +#define TOLOWER(a) ( TOLOWER_( (a) ) ) + +/* + This sample just crawls pages that contains certain keywords, and skips the other ones +*/ + +static char stringfilter[8192]; +static char* stringfilters[128]; +static int initialized = 0; + +/* +"check-html" callback +from htsdefines.h: +typedef int (* t_hts_htmlcheck)(char* html,int len,char* address,char* filename); +*/ +EXTERNAL_FUNCTION int process(char* html, int len, char* address, char* filename) { + int i = 0; + int getIt = 0; + char* pos; + if (!initialized) { + fprintf(stderr, "** ERROR! process_init() was not called by httrack - you are probably using an old version (<3.31)\n"); + fprintf(stderr, "** bailing out..\n"); + exit(1); + } + if (strcmp(address, "primary") == 0 && strcmp(filename, "/primary") == 0) /* primary page (list of links) */ + return 1; + while(stringfilters[i] != NULL && ! getIt) { + if ( ( pos = strstr(html, stringfilters[i]) ) != NULL) { + int j; + getIt = 1; + fprintf(stderr, "** callback info: found '%s' keyword in '%s%s', crawling this page!\n", stringfilters[i], address, filename); + fprintf(stderr, "** details:\n(..)"); + for(j = 0; j < 72 && pos[j] ; j++) { + if (pos[j] > 32) + fprintf(stderr, "%c", pos[j]); + else + fprintf(stderr, "?"); + } + fprintf(stderr, "(..)\n"); + } + i++; + } + if (getIt) { + return 1; /* success */ + } else { + fprintf(stderr, "** callback info: won't parse '%s%s' (no specified keywords found)\n", address, filename); + return 0; /* this page sucks, don't parse it */ + } +} + +/* _init() will be called, if exists, upon startup */ +EXTERNAL_FUNCTION int wrapper_init(char* module, char* initString) { + char* a = stringfilter; + int i = 0; + fprintf(stderr, "** info: wrapper_init(%s, %s) called!\n", module, initString); + fprintf(stderr, "** callback example: crawling pages only if specific keywords are found\n"); + if (initString == NULL || *initString == '\0') { + fprintf(stderr, "** callback error: arguments expected or bad arguments\n"); + fprintf(stderr, "usage: httrack --wrapper save-name=callback:process,stringtofind,stringtofind..\n"); + fprintf(stderr, "example: httrack --wrapper save-name=callback:process,apple,orange,lemon\n"); + return 0; + } + + /* stringfilters = split(initString, ','); */ + strcpy(stringfilter, initString); + while(a != NULL) { + stringfilters[i] = a; + a = strchr(a, ','); + if (a != NULL) { + *a = '\0'; + a ++; + } + fprintf(stderr, "** callback info: will crawl pages with '%s' in them\n", stringfilters[i]); + i++; + } + stringfilters[i++] = NULL; + initialized = 1; /* we're ok */ + return 1; /* success */ +} + +/* _exit() will be called, if exists, upon exit */ +EXTERNAL_FUNCTION int wrapper_exit(void) { + fprintf(stderr, "** info: wrapper_exit() called!\n"); + initialized = 0; + return 1; /* success (result ignored anyway in xx_exit) */ +} -- cgit v1.2.3