From df23b04506e6b4cdfa5c40ce52172728bfe1d063 Mon Sep 17 00:00:00 2001 From: haturatu Date: Sun, 15 Dec 2024 00:54:08 +0900 Subject: add charset --- chardet/chardet.go | 23 +++++++++++++++++++++++ go.mod | 4 +++- go.sum | 6 ++++-- main.go | 12 ++++++++++-- 4 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 chardet/chardet.go diff --git a/chardet/chardet.go b/chardet/chardet.go new file mode 100644 index 0000000..9fb68bc --- /dev/null +++ b/chardet/chardet.go @@ -0,0 +1,23 @@ +package chardet + +import ( + "bytes" + "io" + "golang.org/x/net/html/charset" +) + +func DetectAndDecode(r io.Reader) (*bytes.Reader, error) { + decoded, err := charset.NewReader(r, "text/html") + if err != nil { + return nil, err + } + + buf := new(bytes.Buffer) + _, err = io.Copy(buf, decoded) + if err != nil { + return nil, err + } + + return bytes.NewReader(buf.Bytes()), nil +} + diff --git a/go.mod b/go.mod index 188e966..9e6c573 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,6 @@ module ght go 1.23.3 -require golang.org/x/net v0.31.0 +require golang.org/x/text v0.21.0 // indirect + +require golang.org/x/net v0.32.0 diff --git a/go.sum b/go.sum index 26142d0..704655a 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,4 @@ -golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= -golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= +golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI= +golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= diff --git a/main.go b/main.go index 903abc4..221783f 100644 --- a/main.go +++ b/main.go @@ -7,6 +7,8 @@ import ( "os" "time" + "ght/chardet" + "golang.org/x/net/html" ) @@ -39,7 +41,13 @@ func fetchAndParse(client *http.Client, url string, useRange bool) (string, erro } defer resp.Body.Close() - doc, 
err := html.Parse(resp.Body) + // detect the character encoding and decode the response body + body, err := chardet.DetectAndDecode(resp.Body) + if err != nil { + return "", fmt.Errorf("failed to decode response body: %w", err) + } + + doc, err := html.Parse(body) if err != nil { return "", fmt.Errorf("failed to parse HTML: %w", err) } @@ -64,7 +72,7 @@ func fetchTitle(url string) (string, error) { return title, nil } - // no range limit : get reqest + // no range limit : get request title, err = fetchAndParse(client, url, false) if err != nil { return "", err -- cgit v1.2.3