geturl.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

#!/usr/bin/env ruby
require 'bundler/setup'
Bundler.require
require 'parallel'
require 'faraday'
require 'nokogiri'
require 'charlock_holmes'
require 'unicode_utils'

# Script-wide settings, read by the methods below through these global names.

# Path of the input file; the entry point passes it to process_urls, which
# reads one URL per line from it.
$FILE_PATH = './ok'
# File that process_url appends successful "URL / title" entries to.
$RESULT_FILE = 'Result'
# File that process_url appends entries to when the title starts with the
# error prefix.
$OTHER_ERROR_FILE = 'Other'
# Value handed to Parallel.each as `in_threads:` in process_urls.
$CONCURRENCY = 10

# Heuristically reports whether +text+ looks like it was decoded with the
# wrong character encoding.
#
# A string is considered garbled when it contains the replacement character
# or at least one character whose ordinal is greater than 0xFFFF.
#
# @param text [String] the text to inspect
# @return [Boolean] true when either marker is present, false otherwise
def is_garbled?(text)
  return true if text.include?('�')

  text.each_char.any? { |ch| ch.ord > 0xFFFF }
end

# Normalizes a page title before it is written out.
#
# Steps, in order:
# 1. drop every character whose Unicode general category starts with "C",
#    unless the character is one of the brackets in the allow-list;
# 2. apply NFKC normalization;
# 3. discard characters that are not validly encoded;
# 4. strip leading and trailing whitespace.
#
# @param title [String] raw title text
# @return [String] the cleaned title
def clean_title(title)
  kept_brackets = ['(', ')', '[', ']', '{', '}', '【', '】', '「', '」', '（', '）']

  visible = title.each_char.select do |ch|
    !UnicodeUtils.general_category(ch).start_with?('C') || kept_brackets.include?(ch)
  end

  normalized = UnicodeUtils.nfkc(visible.join)
  normalized.each_char.select(&:valid_encoding?).join.strip
end

# Decodes +bytes+ as +encoding+, parses the result as HTML and returns the
# stripped <title> text (or the "title not found" placeholder when the
# document has no <title>).
#
# Works on a copy, so the caller's string keeps its own encoding tag.
#
# @param bytes [String] raw response body
# @param encoding [String] encoding name to interpret the bytes as
# @return [String, nil] the title, or nil when decoding/parsing raised
#   (for example because +encoding+ is not a name Ruby recognises)
def title_from_bytes(bytes, encoding)
  text = bytes.dup.force_encoding(encoding).encode('UTF-8', invalid: :replace, undef: :replace)
  Nokogiri::HTML(text).at_css('title')&.text&.strip || 'タイトルが見つかりません'
rescue StandardError
  nil
end

# Fetches +url+ and returns its page title as UTF-8 text.
#
# The body is decoded with a fixed list of candidate encodings until the
# title no longer looks garbled (see is_garbled?). If none works, the encoding
# is guessed with CharlockHolmes; if that also fails, a title is derived from
# the last path segment of the URL.
#
# @param url [String] the page to fetch
# @return [Array(String, String)] +[url, title]+; when any StandardError is
#   raised the second element is a message starting with "エラー:"
def get_page_title(url)
  # NOTE(review): `verify: false` turns off TLS certificate verification, so
  # responses can be spoofed by a man-in-the-middle. Kept as-is because it
  # looks deliberate for crawling hosts with broken certificates - confirm.
  conn = Faraday.new(url: url, ssl: { verify: false }) do |faraday|
    faraday.adapter Faraday.default_adapter
    faraday.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  end

  response = conn.get do |req|
    req.options.timeout = 30
  end

  # to_s guards against a nil body; the string itself is never mutated below.
  content = response.body.to_s
  encodings = %w[UTF-8 Shift_JIS EUC-JP ISO-2022-JP Windows-31J]

  title = nil
  encodings.each do |encoding|
    candidate = title_from_bytes(content, encoding)
    next if candidate.nil? # this encoding could not be applied; try the next

    title = candidate
    break unless is_garbled?(title)
  end

  # A nil title means every candidate raised; treat it like a garbled result
  # instead of letting is_garbled?(nil) blow up.
  if title.nil? || is_garbled?(title)
    detection = CharlockHolmes::EncodingDetector.detect(content)
    # Defensive: tolerate a missing detection result or a result without an
    # :encoding entry by falling back to UTF-8.
    encoding = (detection && detection[:encoding]) || 'UTF-8'
    # title_from_bytes returns nil if the detected name is unknown to Ruby,
    # which sends us to the URL-derived fallback rather than an error.
    title = title_from_bytes(content, encoding)
    if title.nil? || is_garbled?(title)
      title = url.split('/').last.to_s.tr('-_', ' ').capitalize
      title = 'エラー: タイトルを抽出できません' if title.empty?
    end
  end

  [url, clean_title(title)]
rescue => e
  [url, "エラー: #{e.class} - #{e.message}"]
end

# Resolves the title for +url+ and records the outcome.
#
# Titles beginning with "エラー:" are appended to $OTHER_ERROR_FILE; every
# other title is appended to $RESULT_FILE. The URL/title pair is also printed
# to standard output in both cases.
#
# @param url [String] the URL to process
# @return [nil]
def process_url(url)
  fetched_url, title = get_page_title(url)

  destination, entry =
    if title.start_with?('エラー:')
      [$OTHER_ERROR_FILE, "URL: #{fetched_url}\nエラー: #{title}\n\n"]
    else
      [$RESULT_FILE, "URL: #{fetched_url}\nタイトル: #{title}\n\n"]
    end

  File.open(destination, 'a') { |out| out.puts entry }
  puts "URL: #{fetched_url}\nタイトル: #{title}\n\n"
end

# Reads URLs from +file_path+ (one per line) and processes them concurrently.
#
# Lines are stripped and blank ones are ignored. Each URL is passed to
# process_url on one of $CONCURRENCY threads, followed by a random pause of
# 1 to 3 seconds in that thread.
#
# Terminates the process with status 1 when the file does not exist and with
# status 0 when it holds no URLs, printing a message first in both cases.
#
# @param file_path [String] path of the URL list
def process_urls(file_path)
  # Prints the message, then ends the process with the given status.
  bail = lambda do |message, status|
    puts message
    exit(status)
  end

  bail.call("エラー: 指定されたファイル '#{file_path}' が見つかりません。", 1) unless File.exist?(file_path)

  targets = File.foreach(file_path).map(&:strip).reject(&:empty?)

  bail.call("警告: ファイル '#{file_path}' にURLが含まれていません。", 0) if targets.empty?

  Parallel.each(targets, in_threads: $CONCURRENCY) do |target|
    process_url(target)
    sleep(rand(1.0..3.0))
  end
end

# Script entry point: process every URL listed in $FILE_PATH.
# Each failure path prints a message to standard output and exits with
# status 1. The specific Errno rescues come first so they win over the
# generic StandardError handler at the end.
begin
  process_urls($FILE_PATH)
rescue Errno::ENOENT => e
  # A file or directory involved in the run does not exist.
  puts "エラー: ファイルが見つかりません - #{e.message}"
  exit(1)
rescue Errno::EACCES => e
  # Permission was denied for a file involved in the run.
  puts "エラー: ファイルへのアクセス権限がありません - #{e.message}"
  exit(1)
rescue => e
  # Any other StandardError: report it together with its backtrace.
  puts "予期せぬエラーが発生しました: #{e.message}"
  puts e.backtrace
  exit(1)
end