diff options
-rw-r--r-- | Gemfile | 7 | ||||
-rw-r--r-- | Gemfile.lock | 45 | ||||
-rw-r--r-- | README.md | 1 | ||||
-rwxr-xr-x | geturl.rb | 141 | ||||
-rwxr-xr-x | md.rb | 40 |
5 files changed, 234 insertions, 0 deletions
diff --git a/Gemfile b/Gemfile
new file mode 100644
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,7 @@
+source 'https://rubygems.org'
+
+gem 'parallel'
+gem 'faraday'
+gem 'nokogiri'
+gem 'charlock_holmes'
+gem 'unicode_utils'
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..fdce3bb
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,45 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    charlock_holmes (0.7.9)
+    faraday (1.2.0)
+      multipart-post (>= 1.2, < 3)
+      ruby2_keywords
+    faraday_middleware (1.2.0)
+      faraday (~> 1.0)
+    multipart-post (2.4.1)
+    nokogiri (1.16.7-aarch64-linux)
+      racc (~> 1.4)
+    nokogiri (1.16.7-arm-linux)
+      racc (~> 1.4)
+    nokogiri (1.16.7-arm64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.16.7-x86-linux)
+      racc (~> 1.4)
+    nokogiri (1.16.7-x86_64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.16.7-x86_64-linux)
+      racc (~> 1.4)
+    parallel (1.26.3)
+    racc (1.8.1)
+    ruby2_keywords (0.0.5)
+    unicode_utils (1.4.0)
+
+PLATFORMS
+  aarch64-linux
+  arm-linux
+  arm64-darwin
+  x86-linux
+  x86_64-darwin
+  x86_64-linux
+
+DEPENDENCIES
+  charlock_holmes
+  faraday
+  faraday_middleware
+  nokogiri
+  parallel
+  unicode_utils
+
+BUNDLED WITH
+   2.5.18
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f39fe18
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# paint-it-url
diff --git a/geturl.rb b/geturl.rb
new file mode 100755
--- /dev/null
+++ b/geturl.rb
@@ -0,0 +1,141 @@
+#!/usr/bin/env ruby
+# Fetches the <title> of every URL listed in a text file (one URL per line).
+# Titles are appended to RESULT_FILE; failures are appended to OTHER_ERROR_FILE.
+#
+# Usage: ./geturl.rb [URL_LIST_FILE]   (defaults to FILE_PATH)
+require 'bundler/setup'
+Bundler.require
+require 'parallel'
+require 'faraday'
+require 'nokogiri'
+require 'charlock_holmes'
+require 'unicode_utils'
+
+FILE_PATH = '/your/bookmark/txtfile'
+RESULT_FILE = 'Result'
+OTHER_ERROR_FILE = 'Other'
+CONCURRENCY = 10
+# Encodings tried, in order, before falling back to automatic detection.
+CANDIDATE_ENCODINGS = %w[UTF-8 Shift_JIS EUC-JP ISO-2022-JP Windows-31J].freeze
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+NO_TITLE = 'No title found'
+# Serializes file and console writes issued from the worker threads.
+OUTPUT_LOCK = Mutex.new
+
+# Heuristic check for a mis-decoded title.
+#
+# @param text [String] decoded title text
+# @return [Boolean] true if text contains U+FFFD or any character above U+FFFF
+def is_garbled?(text)
+  text.include?("\uFFFD") || text.each_char.any? { |char| char.ord > 0xFFFF }
+end
+
+# Normalizes a title before it is written out.
+#
+# @param title [String] raw title text
+# @return [String] title with "C*" general-category characters removed,
+#   NFKC-normalized and stripped of surrounding whitespace
+def clean_title(title)
+  # to_s: run the prefix test on a String whatever the category value is (nil included).
+  title = title.each_char.reject { |ch| UnicodeUtils.general_category(ch).to_s.start_with?('C') }.join
+  title = UnicodeUtils.nfkc(title)
+  title = title.each_char.select(&:valid_encoding?).join
+  title.strip
+end
+
+# Decodes the page as the given encoding and pulls out its <title>.
+#
+# @param content [String] raw response body
+# @param encoding [String] encoding name to interpret content as
+# @return [String] stripped title text, or NO_TITLE when there is no <title>
+def extract_title(content, encoding)
+  # dup first: force_encoding retags its receiver in place, and the raw body is
+  # reused for every later attempt.
+  text = content.dup.force_encoding(encoding).encode('UTF-8', invalid: :replace, undef: :replace)
+  Nokogiri::HTML(text).at_css('title')&.text&.strip || NO_TITLE
+end
+
+# Builds a stand-in title from the last path segment of the URL.
+#
+# @param url [String]
+# @return [String] capitalized segment with "-" and "_" turned into spaces, or
+#   an "Error: ..." marker when the segment is empty
+def title_from_url(url)
+  slug = url.split('/').last.to_s.tr('-_', ' ').capitalize
+  slug.empty? ? 'Error: Unable to extract title' : slug
+end
+
+# Downloads a page and determines its title.
+#
+# @param url [String] page to fetch
+# @return [Array(String, String)] the URL and either the cleaned title or an
+#   "Error: <class> - <message>" string when anything raised
+def get_page_title(url)
+  # NOTE(review): verify: false turns off TLS certificate verification; confirm
+  # that this is intended before pointing the script at untrusted networks.
+  conn = Faraday.new(url: url, ssl: { verify: false }) do |faraday|
+    faraday.adapter Faraday.default_adapter
+    faraday.headers['User-Agent'] = USER_AGENT
+  end
+
+  response = conn.get { |req| req.options.timeout = 30 }
+  content = response.body.to_s
+
+  title = nil
+  CANDIDATE_ENCODINGS.each do |encoding|
+    begin
+      title = extract_title(content, encoding)
+      break unless is_garbled?(title)
+    rescue StandardError
+      next # this encoding could not be applied; try the next one
+    end
+  end
+
+  # title is still nil when every candidate encoding raised.
+  if title.nil? || is_garbled?(title)
+    detection = CharlockHolmes::EncodingDetector.detect(content)
+    encoding = (detection && detection[:encoding]) || 'UTF-8'
+    title = extract_title(content, encoding)
+    title = title_from_url(url) if is_garbled?(title)
+  end
+
+  [url, clean_title(title)]
+rescue StandardError => e
+  [url, "Error: #{e.class} - #{e.message}"]
+end
+
+# Fetches one URL's title and records the outcome.
+#
+# Titles starting with "Error:" go to OTHER_ERROR_FILE, everything else to
+# RESULT_FILE; the outcome is also printed to stdout.
+#
+# @param url [String]
+# @return [void]
+def process_url(url)
+  url, title = get_page_title(url)
+  # One lock around all output: this method runs on several threads at once.
+  OUTPUT_LOCK.synchronize do
+    if title.start_with?('Error:')
+      # title already carries the "Error:" prefix, so it is written as-is.
+      File.open(OTHER_ERROR_FILE, 'a') { |f| f.puts "URL: #{url}\n#{title}\n\n" }
+    else
+      File.open(RESULT_FILE, 'a') { |f| f.puts "URL: #{url}\nTitle: #{title}\n\n" }
+    end
+    puts "URL: #{url}\nTitle: #{title}\n\n"
+  end
+end
+
+# Processes every non-blank line of the file as a URL, CONCURRENCY at a time.
+#
+# @param file_path [String] text file with one URL per line
+# @return [void]
+def process_urls(file_path)
+  urls = File.readlines(file_path).map(&:strip).reject(&:empty?)
+
+  Parallel.each(urls, in_threads: CONCURRENCY) do |url|
+    process_url(url)
+    sleep(rand(1.0..3.0)) # random 1-3 s pause in each worker after a URL
+  end
+end
+
+process_urls(ARGV.first || FILE_PATH) if $PROGRAM_NAME == __FILE__
diff --git a/md.rb b/md.rb
new file mode 100755
--- /dev/null
+++ b/md.rb
@@ -0,0 +1,40 @@
+#!/usr/bin/env ruby
+# Converts the "URL: ... / Title: ..." records written by geturl.rb into a list
+# of Markdown links.
+#
+# Usage: ./md.rb [INPUT_FILE] [OUTPUT_FILE]   (defaults: Result, Result.md)
+
+# Reads URL/title records from input_file and writes one Markdown link per
+# record, each followed by a blank line, to output_file.
+#
+# @param input_file [String] UTF-8 text containing "URL: <url>\nTitle: <title>" pairs
+# @param output_file [String] path of the Markdown file to (over)write
+# @return [void]
+def convert_to_markdown(input_file, output_file)
+  content = File.read(input_file, encoding: 'utf-8')
+
+  # No /m flag: each field is a single line, so "." must not cross newlines.
+  pairs = content.scan(/^URL: (.*)\nTitle: (.*)$/)
+
+  File.open(output_file, 'w', encoding: 'utf-8') do |f|
+    pairs.each do |url, title|
+      url = url.strip
+      # Only handle records whose URL is not empty.
+      next if url.empty?
+
+      # When the title is empty, use the last path segment of the URL
+      # (or the whole URL if there is no such segment).
+      title = title.strip
+      title = url.split('/').last.to_s if title.empty?
+      title = url if title.empty?
+
+      # Escape the characters that would end the link text early.
+      title = title.gsub('[', '\\[').gsub(']', '\\]')
+
+      # Build the Markdown link.
+      f.write("[#{title}](#{url})\n\n")
+    end
+  end
+end
+
+convert_to_markdown(ARGV[0] || 'Result', ARGV[1] || 'Result.md') if $PROGRAM_NAME == __FILE__