summaryrefslogtreecommitdiff
path: root/libtest/callbacks-example-contentfilter.c
blob: 54ee9c0ea280c09229ab533c9986e1f9a4c2d0e2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
    HTTrack external callbacks example : crawling html pages depending on content
    Example of <wrappername>_init and <wrappername>_exit call (httrack >= 3.31)
    .c file

    How to use:
    - compile this file as a module (callback.so or callback.dll)
      example:
      (with gcc)
      gcc -O -g3 -Wall -D_REENTRANT -DINET6 -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -shared -o callback.so callbacks-example-contentfilter.c
      or (with visual c++)
      cl -LD -nologo -W3 -Zi -Zp4 -DWIN32 -Fe"callback.dll" callbacks-example-contentfilter.c
    - use the --wrapper option in httrack:
      httrack --wrapper save-name=callback:process,string[,string..]
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* "External" */
#if defined(_WIN32)
/* Windows build: mark the entry points with __declspec(dllexport) */
#  define EXTERNAL_FUNCTION __declspec(dllexport)
#else
/* Any other platform: the marker expands to nothing */
#  define EXTERNAL_FUNCTION
#endif

/* Function declarations: the entry points of this module, in lifecycle
   order (init, per-page callback, exit). All three are defined further
   down in this file. */
EXTERNAL_FUNCTION int wrapper_init(char *module, char *initString);
EXTERNAL_FUNCTION int process(char *html, int len, char *address, char *filename);
EXTERNAL_FUNCTION int wrapper_exit(void);

/*
  TOLOWER: ASCII-only lowercase conversion.
  'A'..'Z' are mapped to 'a'..'z'; any other value is returned unchanged.
  The range test is hard-coded, so the result does not depend on the locale.

  Both macros are fully parenthesized (arguments and whole expansion), so
  they can be used safely inside larger expressions such as
  TOLOWER_(c) - 'a'.

  Caution: the argument is evaluated more than once - do not pass an
  expression with side effects (e.g. TOLOWER(*p++)).
*/
#define TOLOWER_(a) ( ((a) >= 'A' && (a) <= 'Z') ? ((a) + ('a' - 'A')) : (a) )
#define TOLOWER(a) ( TOLOWER_( (a) ) )

/*
  This sample just crawls pages that contain certain keywords, and skips the others
*/

/* Set to 1 by wrapper_init() on success, reset to 0 by wrapper_exit() */
static int initialized = 0;

/* Private, writable copy of the init string; wrapper_init() overwrites
   each ',' with '\0' so that every keyword becomes its own C string */
static char stringfilter[8192];

/* NULL-terminated list of keywords, each entry pointing into stringfilter */
static char *stringfilters[128];

/*
"check-html" callback
from htsdefines.h:
typedef int   (* t_hts_htmlcheck)(char* html,int len,char* address,char* filename);

Decides whether a page should be parsed, based on the keywords registered
by wrapper_init().

  html      page content; it is searched with strstr(), so it is treated as
            a NUL-terminated string. A NULL pointer is handled as "no
            keyword found".
  len       not used by this function
            (NOTE(review): presumably the length of html - confirm against
            htsdefines.h)
  address,
  filename  identify the page in the log messages; the pair
            "primary" / "/primary" denotes the primary page, which is
            always accepted. NULL is handled as an empty string.

Returns 1 if the page is the primary page or contains at least one keyword,
0 otherwise.
Terminates the program with exit(1) if the module is not initialized.
*/
EXTERNAL_FUNCTION int process(char* html, int len, char* address, char* filename) {
  /* never hand NULL to strcmp() or to a "%s" conversion (undefined behavior) */
  const char* addr = (address != NULL) ? address : "";
  const char* file = (filename != NULL) ? filename : "";
  int i;

  (void) len;   /* unused, see above */

  if (!initialized) {
    fprintf(stderr, "** ERROR! wrapper_init() was not called by httrack - you are probably using an old version (<3.31)\n");
    fprintf(stderr, "** bailing out..\n");
    exit(1);
  }
  if (strcmp(addr, "primary") == 0 && strcmp(file, "/primary") == 0)      /* primary page (list of links) */
    return 1;

  if (html != NULL) {
    /* the first keyword that matches wins */
    for(i = 0; stringfilters[i] != NULL; i++) {
      const char* pos = strstr(html, stringfilters[i]);
      if (pos != NULL) {
        int j;
        fprintf(stderr, "** callback info: found '%s' keyword in '%s%s', crawling this page!\n", stringfilters[i], addr, file);
        fprintf(stderr, "** details:\n(..)");
        /* dump up to 72 characters of context, starting at the match */
        for(j = 0; j < 72 && pos[j] != '\0'; j++) {
          /* Compare as unsigned char so that the result does not depend on
             the signedness of plain char: only printable ASCII is echoed,
             everything else (blanks, control and non-ASCII bytes) becomes '?' */
          const unsigned char c = (unsigned char) pos[j];
          fputc((c > 32 && c < 127) ? c : '?', stderr);
        }
        fprintf(stderr, "(..)\n");
        return 1;  /* success */
      }
    }
  }

  fprintf(stderr, "** callback info: won't parse '%s%s' (no specified keywords found)\n", addr, file);
  return 0;  /* no keyword found: don't parse this page */
}

/* <wrappername>_init() will be called, if exists, upon startup */
/*
  Splits the comma-separated keyword list given in initString and stores
  the keywords in stringfilters[] (NULL-terminated) for later use by
  process().

    module      only used in the log message; may be NULL
    initString  comma-separated keywords, e.g. "apple,orange,lemon"

  Returns 1 on success (and sets 'initialized'), 0 on error:
    - initString is NULL or empty
    - initString does not fit into the stringfilter buffer
    - there are more keywords than stringfilters[] can hold
    - initString contains no non-empty keyword
  Empty keywords ("a,,b" or a trailing comma) are skipped, because an empty
  string is found by strstr() in every page and would disable the filter.
*/
EXTERNAL_FUNCTION int wrapper_init(char* module, char* initString) {
  /* one slot of stringfilters[] is reserved for the terminating NULL */
  const size_t maxFilters = sizeof(stringfilters) / sizeof(stringfilters[0]) - 1;
  size_t count = 0;
  char* token;

  /* passing NULL to a "%s" conversion is undefined behavior */
  fprintf(stderr, "** info: wrapper_init(%s, %s) called!\n",
          (module != NULL) ? module : "(null)",
          (initString != NULL) ? initString : "(null)");
  fprintf(stderr, "** callback example: crawling pages only if specific keywords are found\n");
  if (initString == NULL || *initString == '\0') {
    /* NOTE(review): the usage text names the "save-name" option whereas
       process() is documented as a "check-html" callback - confirm which
       option name is the intended one */
    fprintf(stderr, "** callback error: arguments expected or bad arguments\n");
    fprintf(stderr, "usage: httrack --wrapper save-name=callback:process,stringtofind,stringtofind..\n");
    fprintf(stderr, "example: httrack --wrapper save-name=callback:process,apple,orange,lemon\n");
    return 0;
  }
  if (strlen(initString) >= sizeof(stringfilter)) {
    fprintf(stderr, "** callback error: keyword list too long (max %u characters)\n",
            (unsigned) (sizeof(stringfilter) - 1));
    return 0;
  }

  /* from here on the previous keyword list (if any) is overwritten */
  initialized = 0;
  stringfilters[0] = NULL;

  /* stringfilters = split(initString, ','); */
  strcpy(stringfilter, initString);   /* safe: length checked above */
  token = stringfilter;
  while(token != NULL) {
    /* cut the current keyword at the next ',' and remember where the
       following one starts (NULL when this was the last keyword) */
    char* next = strchr(token, ',');
    if (next != NULL) {
      *next = '\0';
      next ++;
    }
    if (*token == '\0') {
      fprintf(stderr, "** callback info: ignoring empty keyword\n");
    } else if (count >= maxFilters) {
      fprintf(stderr, "** callback error: too many keywords (max %u)\n", (unsigned) maxFilters);
      stringfilters[0] = NULL;
      return 0;
    } else {
      stringfilters[count++] = token;
      fprintf(stderr, "** callback info: will crawl pages with '%s' in them\n", token);
    }
    token = next;
  }
  stringfilters[count] = NULL;
  if (count == 0) {
    fprintf(stderr, "** callback error: arguments expected or bad arguments\n");
    return 0;
  }
  initialized = 1;      /* we're ok */
  return 1;  /* success */
}

/* <wrappername>_exit() will be called, if exists, upon exit */
/*
  Logs the call and clears the 'initialized' flag; process() bails out
  while that flag is 0. Always returns 1.
*/
EXTERNAL_FUNCTION int wrapper_exit(void) {
  initialized = 0;
  fputs("** info: wrapper_exit() called!\n", stderr);
  /* success (result ignored anyway in xx_exit) */
  return 1;
}