From 83144137513b17a962c6a3ba78fed498f5cef2b1 Mon Sep 17 00:00:00 2001 From: Juanito Fatas Date: Sat, 1 Jun 2019 15:27:25 +0900 Subject: [PATCH 1/2] Only package necessary files using easier to understand Dir instead of git command --- twemoji.gemspec | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/twemoji.gemspec b/twemoji.gemspec index a6a54c5..fc71fce 100644 --- a/twemoji.gemspec +++ b/twemoji.gemspec @@ -13,10 +13,15 @@ Gem::Specification.new do |spec| spec.description = spec.summary spec.homepage = "https://github.com/jollygoodcode/twemoji" spec.license = "MIT" - - spec.files = `git ls-files -z`.split("\x0").reject { |f| f =~ %r(^(test)/) } spec.require_paths = %w(lib) + spec.files = Dir[ + "README.md", + "LICENSE.md", + "lib/**/*.yml", + "lib/**/*.rb", + ] + spec.required_ruby_version = "~> 2.0" spec.add_dependency "nokogiri", "~> 1.6" From 2712b9e8fec81ede9b800fa097d4c9d9273c6074 Mon Sep 17 00:00:00 2001 From: Juanito Fatas Date: Sun, 2 Jun 2019 17:58:59 +0900 Subject: [PATCH 2/2] Add rake db:dump to generate unicode emojis json The rake task is defined in lib/tasks/db.rake. Necessary files are downloaded from https://github.com/unicode-org/cldr and saved in the vendor folder, under a folder named after the CLDR release version. - Parse emojis from a CLDR release; the version can be changed in lib/twemoji/db/emoji_data_files.rb. The emojis come from emoji-test.txt (starting from release-32) and the keywords (names) come from the CLDR annotations and annotationsDerived files.
---
 .gitignore                                  |   1 +
 Rakefile                                    |   1 +
 lib/tasks/db.rake                           |  50 +++++++++
 lib/twemoji/db/cldr/annotations.rb          |  64 +++++++++++
 lib/twemoji/db/cldr/emoji_annotation.rb     |  11 ++
 lib/twemoji/db/cldr/emoji_annotations.rb    |  27 +++++
 lib/twemoji/db/emoji_data_files.rb          |  42 +++++++
 lib/twemoji/db/unicode/emoji.rb             |  31 +++++
 lib/twemoji/db/unicode/emoji_category.rb    |  21 ++++
 lib/twemoji/db/unicode/emoji_test_parser.rb | 113 ++++++++++++++++++++
 10 files changed, 361 insertions(+)
 create mode 100644 lib/tasks/db.rake
 create mode 100644 lib/twemoji/db/cldr/annotations.rb
 create mode 100644 lib/twemoji/db/cldr/emoji_annotation.rb
 create mode 100644 lib/twemoji/db/cldr/emoji_annotations.rb
 create mode 100644 lib/twemoji/db/emoji_data_files.rb
 create mode 100644 lib/twemoji/db/unicode/emoji.rb
 create mode 100644 lib/twemoji/db/unicode/emoji_category.rb
 create mode 100644 lib/twemoji/db/unicode/emoji_test_parser.rb

diff --git a/.gitignore b/.gitignore
index ae3fdc2..9804ed0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@
 *.o
 *.a
 mkmf.log
+vendor/
diff --git a/Rakefile b/Rakefile
index aec2778..4aeef5a 100644
--- a/Rakefile
+++ b/Rakefile
@@ -1,6 +1,7 @@
 #!/usr/bin/env rake
 require "bundler/gem_tasks"
 require "rake/testtask"
+load "lib/tasks/db.rake"
 
 Rake::TestTask.new do |t|
   t.libs << "test"
diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake
new file mode 100644
index 0000000..6824777
--- /dev/null
+++ b/lib/tasks/db.rake
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+require "fileutils"
+require "json"
+
+require_relative "../twemoji/db/unicode/emoji_test_parser"
+
+# Downloads +url+ to +output_file+ with curl, creating the parent directory
+# first. Progress goes to stderr so stdout stays clean for db:dump output.
+# Aborts the rake run when curl fails or cannot be started.
+def curl_download(url, output_file)
+  warn "Downloading #{url} => #{output_file}"
+  FileUtils.mkdir_p(File.dirname(output_file))
+  return if system("curl", "-fsSL", url, "-o", output_file)
+
+  # Drop whatever curl managed to write; otherwise the file task would
+  # treat a partial download as up to date on the next run.
+  FileUtils.rm_f(output_file)
+  abort "Failed to download #{url}"
+end
+
+file EmojiDataFiles.emoji_test_file do |task|
+  curl_download(EmojiDataFiles.emoji_test_url, task.name)
+end
+
+file EmojiDataFiles.annotations_file do |task|
+  curl_download(EmojiDataFiles.annotations_url, task.name)
+end
+
+file EmojiDataFiles.annotations_derived_file do |task|
+  curl_download(EmojiDataFiles.annotations_derived_url, task.name)
+end
+
+namespace :db do
+  desc %(Prepare data files needed for generating emojis.json)
+  task prepare_files: [
+    EmojiDataFiles.emoji_test_file,
+    EmojiDataFiles.annotations_file,
+    EmojiDataFiles.annotations_derived_file,
+  ]
+
+  desc "Print emojis JSON to stdout (redirect it to save emojis.json)"
+  task dump: :prepare_files do
+    emojis = EmojiTestParser.parse
+
+    puts JSON.pretty_generate(emojis)
+    # The summary goes to stderr so redirected stdout remains valid JSON.
+    warn "Parsed #{emojis.size} emojis from #{EmojiDataFiles.version}!"
+  end
+end
diff --git a/lib/twemoji/db/cldr/annotations.rb b/lib/twemoji/db/cldr/annotations.rb
new file mode 100644
index 0000000..0a67e5f
--- /dev/null
+++ b/lib/twemoji/db/cldr/annotations.rb
@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+
+require "nokogiri"
+require_relative "../../utils/unicode"
+
+require_relative "emoji_annotation"
+require_relative "emoji_annotations"
+
+module CLDR
+  # Reads emoji keywords from a CLDR annotations file and its
+  # annotationsDerived counterpart into one EmojiAnnotations collection.
+  class Annotations
+    attr_reader :annotations
+
+    # annotation, annotation_derived: paths of the two XML files to read.
+    def initialize(annotation, annotation_derived)
+      @annotations = parse_files(annotation, annotation_derived)
+    end
+
+    private
+
+    def parse_files(annotation, annotation_derived)
+      parse_file(annotation) + parse_file(annotation_derived)
+    end
+
+    def parse_file(path)
+      document = get_document(path)
+      parse(document)
+    end
+
+    # Collects every <annotation> node that has no "type" attribute; the
+    # node text is split on " | " into the keyword list.
+    def parse(document)
+      result = EmojiAnnotations.new
+
+      document.css("annotations annotation").each do |annotation_node|
+        next if annotation_node.attributes.key?("type")
+
+        codepoints = get_codepoints(annotation_node.attributes["cp"].text)
+        keywords = annotation_node.text.split(" | ")
+
+        emoji_annotation = EmojiAnnotation.new(
+          codepoints: codepoints,
+          keywords: keywords,
+        )
+
+        result.add(emoji_annotation)
+      end
+
+      result
+    end
+
+    def get_document(path)
+      # File.read never treats a "|"-prefixed path as a command to run.
+      Nokogiri::XML.parse(File.read(path))
+    end
+
+    # Normalizes the "cp" attribute into an upper-cased codepoints string so
+    # it can be compared with codepoints parsed from the emoji test file.
+    def get_codepoints(unicode)
+      Twemoji::Utils::Unicode.unpack(unicode, connector: " ").upcase
+    end
+  end
+end
diff --git a/lib/twemoji/db/cldr/emoji_annotation.rb b/lib/twemoji/db/cldr/emoji_annotation.rb
new file mode 100644
index 0000000..9a300fc
--- /dev/null
+++ b/lib/twemoji/db/cldr/emoji_annotation.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+# The keywords recorded for one emoji, identified by its codepoints.
+class EmojiAnnotation
+  attr_reader :codepoints, :keywords
+
+  def initialize(codepoints:, keywords:)
+    @codepoints = codepoints
+    @keywords = keywords
+  end
+end
diff --git a/lib/twemoji/db/cldr/emoji_annotations.rb b/lib/twemoji/db/cldr/emoji_annotations.rb
new file mode 100644
index 0000000..da27101
--- /dev/null
+++ b/lib/twemoji/db/cldr/emoji_annotations.rb
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+# An ordered collection of EmojiAnnotation objects, searchable by codepoints.
+class EmojiAnnotations
+  attr_reader :annotations
+
+  def initialize(annotations = [])
+    @annotations = annotations
+  end
+
+  # Returns a new collection holding this collection's annotations followed
+  # by those of +emoji_annotations+; neither operand is modified.
+  def +(emoji_annotations)
+    self.class.new(
+      self.annotations + emoji_annotations.annotations
+    )
+  end
+
+  def add(annotation)
+    @annotations << annotation
+  end
+
+  # Returns the first annotation whose codepoints equal +codepoints+, or nil.
+  def find_by(codepoints:)
+    annotations.find { |annotation| codepoints == annotation.codepoints }
+  end
+end
diff --git a/lib/twemoji/db/emoji_data_files.rb b/lib/twemoji/db/emoji_data_files.rb
new file mode 100644
index 0000000..f579eed
--- /dev/null
+++ b/lib/twemoji/db/emoji_data_files.rb
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+# Where the CLDR data files live: the URLs they are downloaded from and the
+# local paths under vendor/<release>/ where they are stored.
+class EmojiDataFiles
+  VERSION = "release-35-1"
+  VENDOR_DIR = File.expand_path("../../../vendor", __dir__)
+  UNICODE_REPO_ROOT = "https://raw.githubusercontent.com/unicode-org/cldr/#{VERSION}"
+  REMOTE_EMOJI_TEST_PATH = "tools/java/org/unicode/cldr/util/data/emoji/emoji-test.txt"
+  REMOTE_ANNOTATIONS_PATH = "common/annotations/en.xml"
+  REMOTE_ANNOTATIONS_DERIVED_PATH = "common/annotationsDerived/en.xml"
+  private_constant :VERSION, :VENDOR_DIR, :UNICODE_REPO_ROOT
+  private_constant :REMOTE_EMOJI_TEST_PATH, :REMOTE_ANNOTATIONS_PATH, :REMOTE_ANNOTATIONS_DERIVED_PATH
+
+  def self.version
+    "CLDR #{VERSION}"
+  end
+
+  def self.emoji_test_file
+    File.join(VENDOR_DIR, VERSION, "emoji-test.txt")
+  end
+
+  def self.annotations_file
+    File.join(VENDOR_DIR, VERSION, "annotations/en.xml")
+  end
+
+  def self.annotations_derived_file
+    File.join(VENDOR_DIR, VERSION, "annotationsDerived/en.xml")
+  end
+
+  def self.emoji_test_url
+    File.join(UNICODE_REPO_ROOT, REMOTE_EMOJI_TEST_PATH)
+  end
+
+  def self.annotations_url
+    File.join(UNICODE_REPO_ROOT, REMOTE_ANNOTATIONS_PATH)
+  end
+
+  def self.annotations_derived_url
+    File.join(UNICODE_REPO_ROOT, REMOTE_ANNOTATIONS_DERIVED_PATH)
+  end
+end
diff --git a/lib/twemoji/db/unicode/emoji.rb b/lib/twemoji/db/unicode/emoji.rb
new file mode 100644
index 0000000..90a62b5
--- /dev/null
+++ b/lib/twemoji/db/unicode/emoji.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+# One emoji parsed from a line of the emoji test file.
+class Emoji
+  attr_reader :codepoints, :status, :unicode, :description
+
+  # data: anything answering [] for :codepoints, :status, :unicode and
+  # :description, e.g. a MatchData with those named groups or a Hash.
+  def initialize(data)
+    @codepoints = data[:codepoints].strip
+    @status = data[:status].strip
+    @unicode = data[:unicode]
+    @description = data[:description]
+  end
+
+  def to_h(group:, subgroup:, keywords:)
+    {
+      unicode: unicode,
+      codepoints: codepoints,
+      description: description,
+      keywords: keywords,
+      group: group,
+      subgroup: subgroup,
+      status: status,
+    }
+  end
+
+  def inspect
+    "#<Emoji #{unicode} #{codepoints} (#{status})>"
+  end
+end
diff --git a/lib/twemoji/db/unicode/emoji_category.rb b/lib/twemoji/db/unicode/emoji_category.rb
new file mode 100644
index 0000000..121d63c
--- /dev/null
+++ b/lib/twemoji/db/unicode/emoji_category.rb
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+# A named group or subgroup of the emoji test file. A group's +emojis+ are
+# its subgroups; a subgroup's +emojis+ are Emoji objects.
+class EmojiCategory
+  attr_reader :name, :type, :emojis
+
+  def initialize(name, type:, emojis: [])
+    @name = name
+    @type = type
+    @emojis = emojis
+  end
+
+  def add(emoji)
+    @emojis << emoji
+  end
+
+  def inspect
+    "#<EmojiCategory #{type} #{name.inspect}: #{emojis.size} objects>"
+  end
+end
diff --git a/lib/twemoji/db/unicode/emoji_test_parser.rb b/lib/twemoji/db/unicode/emoji_test_parser.rb
new file mode 100644
index 0000000..79b8d40
--- /dev/null
+++ b/lib/twemoji/db/unicode/emoji_test_parser.rb
@@ -0,0 +1,113 @@
+# frozen_string_literal: true
+
+require_relative "emoji_category"
+require_relative "emoji"
+require_relative "../cldr/annotations"
+require_relative "../emoji_data_files"
+
+# Parses the emoji test file into a flat array of emoji hashes, each tagged
+# with its group, subgroup and CLDR keywords.
+class EmojiTestParser
+  def self.parse(file = EmojiDataFiles.emoji_test_file)
+    new(file).parse
+  end
+
+  def initialize(file)
+    @file = file
+  end
+
+  # Returns an array of hashes as built by Emoji#to_h, in file order.
+  def parse
+    emoji_categories = []
+    group = nil
+    subgroup = nil
+
+    File.foreach(file) do |line|
+      case
+      when line.start_with?("# group: ")
+        group = parse_group(line)
+        emoji_categories << group
+      when line.start_with?("# subgroup: ")
+        subgroup = parse_subgroup(line)
+        group.add(subgroup)
+      when comment?(line) || newline?(line)
+        next
+      else
+        emoji = parse_emoji(line)
+        subgroup.add(emoji)
+      end
+    end
+
+    flatten(emoji_categories)
+  end
+
+  private
+  attr_reader :file
+
+  def parse_name(line)
+    _, name = line.split(":", 2)
+    name.strip
+  end
+
+  def parse_group(line)
+    group_name = parse_name(line)
+    EmojiCategory.new(group_name, type: "group")
+  end
+
+  def parse_subgroup(line)
+    group_name = parse_name(line)
+    EmojiCategory.new(group_name, type: "subgroup")
+  end
+
+  def comment?(line)
+    line.start_with?("#")
+  end
+
+  def newline?(line)
+    line.strip.empty?
+  end
+
+  # Matches "<codepoints>;<status># <emoji> <description>". The group names
+  # are the keys Emoji#initialize reads from the match.
+  EMOJI_LINE_REGEXP = /(?<codepoints>.+);(?<status>.+)# (?<unicode>[^[\s]]+)\s(?<description>.+)/
+  private_constant :EMOJI_LINE_REGEXP
+
+  def parse_emoji(line)
+    matched = line.match(EMOJI_LINE_REGEXP)
+    # Fail with the offending line rather than a NoMethodError on nil.
+    raise ArgumentError, "Unrecognized emoji line: #{line.inspect}" unless matched
+
+    Emoji.new(matched)
+  end
+
+  # Turns the group -> subgroup -> emoji tree into a flat array of hashes.
+  def flatten(data)
+    data.flat_map do |emoji_category|
+      emoji_category.emojis.flat_map do |emoji_subcategory|
+        emoji_subcategory.emojis.map do |raw_emoji|
+          raw_emoji.to_h(
+            group: emoji_category.name,
+            subgroup: emoji_subcategory.name,
+            keywords: get_annotation(raw_emoji.codepoints).keywords,
+          )
+        end
+      end
+    end
+  end
+
+  # Stand-in for emojis that have no CLDR annotation.
+  class NullAnnotation
+    def keywords; []; end
+  end
+
+  def get_annotation(codepoints)
+    cldr_annotations.find_by(codepoints: codepoints) || NullAnnotation.new
+  end
+
+  def cldr_annotations
+    @cldr_annotations ||= CLDR::Annotations.new(
+      EmojiDataFiles.annotations_file,
+      EmojiDataFiles.annotations_derived_file,
+    ).annotations
+  end
+end