summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author haturatu <taro@eyes4you.org> 2024-09-01 02:04:57 +0900
committer haturatu <taro@eyes4you.org> 2024-09-01 02:04:57 +0900
commit f08411e3d4cec3be138eaef300fa419b02bd286d (patch)
tree 3926a6c14efe708ab9428530b932601bd63afa07
first commit
-rw-r--r-- Gemfile 7
-rw-r--r-- Gemfile.lock 45
-rw-r--r-- README.md 1
-rwxr-xr-x geturl.rb 87
-rwxr-xr-x md.rb 26
5 files changed, 166 insertions, 0 deletions
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..0a88729
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,7 @@
+source 'https://rubygems.org'
+
+gem 'parallel'
+gem 'faraday'
+gem 'nokogiri'
+gem 'charlock_holmes'
+gem 'unicode_utils'
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..fdce3bb
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,45 @@
+GEM
+ remote: https://rubygems.org/
+ specs:
+ charlock_holmes (0.7.9)
+ faraday (1.2.0)
+ multipart-post (>= 1.2, < 3)
+ ruby2_keywords
+ faraday_middleware (1.2.0)
+ faraday (~> 1.0)
+ multipart-post (2.4.1)
+ nokogiri (1.16.7-aarch64-linux)
+ racc (~> 1.4)
+ nokogiri (1.16.7-arm-linux)
+ racc (~> 1.4)
+ nokogiri (1.16.7-arm64-darwin)
+ racc (~> 1.4)
+ nokogiri (1.16.7-x86-linux)
+ racc (~> 1.4)
+ nokogiri (1.16.7-x86_64-darwin)
+ racc (~> 1.4)
+ nokogiri (1.16.7-x86_64-linux)
+ racc (~> 1.4)
+ parallel (1.26.3)
+ racc (1.8.1)
+ ruby2_keywords (0.0.5)
+ unicode_utils (1.4.0)
+
+PLATFORMS
+ aarch64-linux
+ arm-linux
+ arm64-darwin
+ x86-linux
+ x86_64-darwin
+ x86_64-linux
+
+DEPENDENCIES
+ charlock_holmes
+ faraday
+ faraday_middleware
+ nokogiri
+ parallel
+ unicode_utils
+
+BUNDLED WITH
+ 2.5.18
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f39fe18
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# paint-it-url
diff --git a/geturl.rb b/geturl.rb
new file mode 100755
index 0000000..2e446d4
--- /dev/null
+++ b/geturl.rb
@@ -0,0 +1,87 @@
+#!/usr/bin/env ruby
+require 'bundler/setup'
+Bundler.require
+require 'parallel'
+require 'faraday'
+require 'nokogiri'
+require 'charlock_holmes'
+require 'unicode_utils'
+
+FILE_PATH = '/your/bookmark/txtfile'
+RESULT_FILE = 'Result'
+OTHER_ERROR_FILE = 'Other'
+CONCURRENCY = 10
+
+def is_garbled?(text)
+ text.include?('�') || text.chars.any? { |char| char.ord > 0xFFFF }
+end
+
+def clean_title(title)
+ title = title.chars.reject { |ch| UnicodeUtils.general_category(ch).start_with?('C') }.join
+ title = UnicodeUtils.nfkc(title)
+ title = title.chars.select(&:valid_encoding?).join
+ title.strip
+end
+
+# Decodes +content+ as +encoding+, parses the result as HTML and returns the
+# stripped <title> text, or 'No title found' when there is no <title> element.
+# Works on a copy, so the caller's string keeps its bytes and encoding tag.
+def extract_title(content, encoding)
+  text = content.dup.force_encoding(encoding).encode('UTF-8', invalid: :replace, undef: :replace)
+  Nokogiri::HTML(text).at_css('title')&.text&.strip || 'No title found'
+end
+
+# Fetches +url+ and works out a displayable page title.
+#
+# The body is decoded with each candidate encoding in turn until the title no
+# longer looks garbled (see is_garbled?). If none works, the encoding reported
+# by CharlockHolmes is tried, and as a last resort a title is derived from the
+# final path segment of the URL.
+#
+# @param url [String] address to fetch
+# @return [Array(String, String)] the URL and its cleaned title; on any
+#   StandardError the second element is "Error: <class> - <message>"
+def get_page_title(url)
+  # NOTE(review): TLS certificate verification is disabled, so HTTPS responses
+  # are not authenticated. Confirm this is intended.
+  conn = Faraday.new(url: url, ssl: { verify: false }) do |faraday|
+    faraday.adapter Faraday.default_adapter
+    faraday.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+  end
+
+  response = conn.get do |req|
+    req.options.timeout = 30
+  end
+
+  content = response.body
+  encodings = ['UTF-8', 'Shift_JIS', 'EUC-JP', 'ISO-2022-JP', 'Windows-31J']
+
+  title = nil
+  encodings.each do |encoding|
+    title = extract_title(content, encoding)
+    break unless is_garbled?(title)
+  rescue StandardError
+    # This candidate could not be decoded or parsed; try the next one.
+    next
+  end
+
+  # A nil title means every candidate raised; treat it like a garbled title.
+  if title.nil? || is_garbled?(title)
+    detection = CharlockHolmes::EncodingDetector.detect(content)
+    encoding = (detection && detection[:encoding]) || 'UTF-8'
+    title = begin
+      extract_title(content, encoding)
+    rescue StandardError
+      # The detector may report an encoding name Ruby does not know; fall
+      # through to the URL-derived title instead of failing the whole lookup.
+      nil
+    end
+
+    if title.nil? || is_garbled?(title)
+      title = url.split('/').last.to_s.tr('-_', ' ').capitalize
+      title = 'Error: Unable to extract title' if title.empty?
+    end
+  end
+
+  [url, clean_title(title)]
+rescue => e
+  [url, "Error: #{e.class} - #{e.message}"]
+end
+
+def process_url(url)
+ url, title = get_page_title(url)
+ if title.start_with?('Error:')
+ File.open(OTHER_ERROR_FILE, 'a') { |f| f.puts "URL: #{url}\nError: #{title}\n\n" }
+ else
+ File.open(RESULT_FILE, 'a') { |f| f.puts "URL: #{url}\nTitle: #{title}\n\n" }
+ end
+ puts "URL: #{url}\nTitle: #{title}\n\n"
+end
+
+def process_urls(file_path)
+ urls = File.readlines(file_path).map(&:strip).reject(&:empty?)
+
+ Parallel.each(urls, in_threads: CONCURRENCY) do |url|
+ process_url(url)
+ sleep(rand(1.0..3.0))
+ end
+end
+
+process_urls(FILE_PATH)
diff --git a/md.rb b/md.rb
new file mode 100755
index 0000000..1a8da2b
--- /dev/null
+++ b/md.rb
@@ -0,0 +1,26 @@
+#!/usr/bin/env ruby
+
+def convert_to_markdown(input_file, output_file)
+ content = File.read(input_file, encoding: 'utf-8')
+
+ pairs = content.scan(/URL: (.*?)\nTitle: (.*?)\n/m)
+
+ File.open(output_file, 'w', encoding: 'utf-8') do |f|
+ pairs.each do |url, title|
+ # URLが空でない場合のみ処理
+ next if url.strip.empty?
+
+ # タイトルが空の場合、URLの最後の部分を使用
+ title = url.split('/')[-1] if title.strip.empty?
+
+ # 特殊文字をエスケープ
+ title = title.gsub('[', '\\[').gsub(']', '\\]')
+
+ # md形式のリンクを作成
+ markdown_link = "[#{title}](#{url})\n\n"
+ f.write(markdown_link)
+ end
+ end
+end
+
+convert_to_markdown('Result', 'Result.md')