summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorXavier Roche <xroche@users.noreply.github.com>2013-05-13 11:17:31 +0000
committerXavier Roche <xroche@users.noreply.github.com>2013-05-13 11:17:31 +0000
commit8cdb6d6fb056575f58eaf7e1a8fae2f31c321d13 (patch)
tree03cf143582b475f3a973ee5569501e7d313a5645 /src
parent1c496d66c8d39fac3ec5dee010e5ef7f3cba83ba (diff)
Limit the final destination path length to 256 (Windows compatibility)
See http://code.google.com/p/httrack/issues/detail?id=9
Diffstat (limited to 'src')
-rw-r--r--src/htscharset.c12
-rw-r--r--src/htscharset.h8
-rw-r--r--src/htsname.c51
3 files changed, 69 insertions, 2 deletions
diff --git a/src/htscharset.c b/src/htscharset.c
index f19e47c..351fa8c 100644
--- a/src/htscharset.c
+++ b/src/htscharset.c
@@ -502,6 +502,18 @@ static int is_space_or_equal_or_quote(char c) {
return is_space_or_equal(c) || c == '"' || c == '\'';
}
+/*
+ * Return the length, in characters, of the NUL-terminated UTF-8 string "s".
+ * Each byte accepted by HTS_IS_LEADING_UTF8 (an ASCII byte or the leading
+ * byte of a multi-byte sequence) counts as one character; all other bytes
+ * are skipped without being counted.
+ */
+size_t hts_stringLengthUTF8(const char *s) {
+  const unsigned char *cursor = (const unsigned char*) s;
+  size_t count = 0;
+  // walk the bytes up to (not including) the terminating \0
+  while(*cursor != '\0') {
+    if (HTS_IS_LEADING_UTF8(*cursor)) {
+      count++;
+    }
+    cursor++;
+  }
+  return count;
+}
+
char* hts_getCharsetFromMeta(const char *html, size_t size) {
int i;
// <META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf-8" >
diff --git a/src/htscharset.h b/src/htscharset.h
index f8f4ca6..55aa2a8 100644
--- a/src/htscharset.h
+++ b/src/htscharset.h
@@ -44,6 +44,9 @@ Please visit our Website: http://www.httrack.com
#include <windows.h>
#endif
+/**
+ * Leading character test: non-zero when byte C is an ASCII byte (< 0x80) or
+ * a byte >= 0xc0 (leading byte of a multi-byte UTF-8 sequence); zero when C
+ * lies in 0x80..0xbf, i.e. matches the bit pattern 10xxxxxx.
+ * C is evaluated exactly once, so arguments with side effects (such as
+ * "*p++") behave as expected.
+ **/
+#define HTS_IS_LEADING_UTF8(C) (((unsigned char)(C) & 0xc0) != 0x80)
+
/**
* Convert the string "s" from charset "charset" to UTF-8.
* Return NULL upon error.
@@ -78,6 +81,11 @@ extern char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize);
**/
extern char *hts_convertStringSystemToUTF8(const char *s, size_t size);
+/**
+ * Get a UTF-8 string length in characters.
+ **/
+extern size_t hts_stringLengthUTF8(const char *s);
+
#endif
#endif
diff --git a/src/htsname.c b/src/htsname.c
index 641e7aa..cc6c344 100644
--- a/src/htsname.c
+++ b/src/htsname.c
@@ -1380,7 +1380,55 @@ int url_savename2(char* adr_complete, char* fil_complete, char* save,
} else if (!IS_DELAYED_EXT(save)) {
strcatbuff(lastDot, "." DELAYED_EXT);
}
- }
+ }
+
+ // enforce 256-character path limit before inserting destination path
+#define HTS_MAX_PATH_LEN 250
+#define MIN_LAST_SEG_RESERVE 30
+ if (hts_stringLengthUTF8(save) + hts_stringLengthUTF8(StringBuff(opt->path_html_utf8)) >= HTS_MAX_PATH_LEN) {
+ char BIGSTK tempo[HTS_URLMAXSIZE*2];
+ const size_t parentLen = hts_stringLengthUTF8(StringBuff(opt->path_html_utf8));
+ // parent path length is not insane (otherwise, ignore and pick 200 as suffix length)
+ const size_t maxLen = parentLen < HTS_MAX_PATH_LEN/2 ? HTS_MAX_PATH_LEN - parentLen : HTS_MAX_PATH_LEN;
+ size_t i, j, lastSeg, sofar;
+ // pick up last segment
+ for(i = 0, lastSeg = 0 ; save[i] != '\0' ; i++) {
+ if (save[i] == '/') {
+ lastSeg = i + 1;
+ }
+ }
+ // add as many path segments as we can
+ for(i = 0, sofar = 0 ; i < lastSeg && i + MIN_LAST_SEG_RESERVE < maxLen ; i++) {
+ tempo[i] = save[i];
+ if (save[i] == '/') {
+ // validate segment so far
+ sofar = i + 1;
+ }
+ }
+ // last segment
+#define MAX_UTF8_SEQ_CHARS 4
+ for(j = 0 ; save[j + i] != '\0' ; j++) {
+ // Stop here before next sequence
+ if (sofar + j + MAX_UTF8_SEQ_CHARS >= maxLen && HTS_IS_LEADING_UTF8(save[i + j])) {
+ break;
+ }
+ // Stop if overflowing
+ else if (sofar + j >= maxLen) {
+ break;
+ }
+ save[sofar + j] = save[i + j];
+ }
+ // terminating \0
+ save[sofar + j] = '\0';
+ // log in debug
+ if ( (opt->debug>1) && (opt->log!=NULL) ) {
+ HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Too long filename shortened: %s%s => %s"LF,adr_complete,fil_complete,sav);
+ test_flush;
+ }
+ }
+#undef MAX_UTF8_SEQ_CHARS
+#undef MIN_LAST_SEG_RESERVE
+#undef HTS_MAX_PATH_LEN
// chemin primaire éventuel A METTRE AVANT
if (strnotempty(StringBuff(opt->path_html_utf8))) {
@@ -1390,7 +1438,6 @@ int url_savename2(char* adr_complete, char* fil_complete, char* save,
strcpybuff(save,tempo);
}
-
// vérifier que le nom n'est pas déja pris...
if (liens!=NULL) {
int nom_ok;