jollygoodcode · JuanitoFatas · Jun 1, 2019 · Jun 2, 2019
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,4 @@
 *.o
 *.a
 mkmf.log
+vendor/
diff --git a/Rakefile b/Rakefile
@@ -1,6 +1,7 @@
 #!/usr/bin/env rake
 require "bundler/gem_tasks"
 require "rake/testtask"
+load "lib/tasks/db.rake"
 
 Rake::TestTask.new do |t|
   t.libs << "test"

diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+require "fileutils"
+require "json"
+
+require_relative "../twemoji/db/unicode/emoji_test_parser"
+
+def curl_download(url, output_file)
+  puts "Downloading #{url} => #{output_file}"
+  directory = File.dirname(output_file)
+  FileUtils.mkdir_p(directory) if !File.exist?(directory)
+  system "curl", "-fsSL", url, "-o", output_file
+end
+
+file EmojiDataFiles.emoji_test_file do |task| # Rake file task named after the local emoji-test.txt path
+  curl_download(EmojiDataFiles.emoji_test_url, task.name) # task.name is that local path, so it doubles as the download target
+end
+
+file EmojiDataFiles.annotations_file do |task| # same pattern for the annotations XML
+  curl_download(EmojiDataFiles.annotations_url, task.name)
+end
+
+file EmojiDataFiles.annotations_derived_file do |task| # same pattern for the derived annotations XML
+  curl_download(EmojiDataFiles.annotations_derived_url, task.name)
+end
+
+namespace :db do # tasks below are invoked as db:prepare_files and db:dump
+  desc %(Prepare data files needed for generating emojis.json)
+  task prepare_files: [ # no body of its own: it only depends on the three file tasks named by these paths
+    EmojiDataFiles.emoji_test_file,
+    EmojiDataFiles.annotations_file,
+    EmojiDataFiles.annotations_derived_file,
+  ]
+
+  desc "Generate emojis.json to db folder"
+  task dump: :prepare_files do # NOTE(review): this body only prints to stdout and writes no file, although the desc says otherwise
+    emojis = EmojiTestParser.parse # no argument: parses EmojiDataFiles.emoji_test_file
+
+    puts JSON.pretty_generate(emojis)
+    puts "Parsed #{emojis.size} emojis from #{EmojiDataFiles.version}!"
+  end
+end
diff --git a/lib/twemoji/db/cldr/annotations.rb b/lib/twemoji/db/cldr/annotations.rb
@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+
+require "nokogiri"
+require_relative "../../utils/unicode"
+
+require_relative "emoji_annotation"
+require_relative "emoji_annotations"
+
+module CLDR
+  class Annotations # Builds one EmojiAnnotations collection from two annotation XML files.
+    attr_reader :annotations # result of parsing both files, set once in #initialize
+
+    def initialize(annotation, annotation_derived) # both arguments are file paths; they are read and parsed right here
+      @annotations = parse_files(annotation, annotation_derived)
+    end
+
+    private
+
+    def parse_files(annotation, annotation_derived)
+      parse_file(annotation) + parse_file(annotation_derived) # EmojiAnnotations#+ keeps first-file entries ahead of the derived ones
+    end
+
+    def parse_file(path) # path -> EmojiAnnotations
+      document = get_document(path)
+      parse(document)
+    end
+
+    def parse(document) # one EmojiAnnotation per kept <annotation> node
+      result = EmojiAnnotations.new
+
+      document.css("annotations annotation").each do |annotation_node|
+        next if annotation_node.attributes.key?("type") # nodes carrying a type attribute are skipped
+
+        codepoints = get_codepoints(annotation_node.attributes["cp"].text) # assumes every kept node has a cp attribute
+        keywords = annotation_node.text.split(" | ") # node text is split on " | " into the keyword list
+
+        emoji_annotation = EmojiAnnotation.new(
+          codepoints: codepoints,
+          keywords: keywords,
+        )
+
+        result.add(emoji_annotation)
+      end
+
+      result
+    end
+
+    def get_document(path) # reads the whole file, then parses it as XML
+      Nokogiri::XML.parse(IO.read(path))
+    end
+
+    def get_codepoints(unicode) # presumably yields space-separated hex codepoints, upcased here; confirm against Unicode.unpack
+      Twemoji::Utils::Unicode.unpack(unicode, connector: " ").upcase
+    end
+  end
+end
diff --git a/lib/twemoji/db/cldr/emoji_annotation.rb b/lib/twemoji/db/cldr/emoji_annotation.rb
@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+
+class EmojiAnnotation # Pairs a codepoints value with its keywords; read-only after construction.
+  attr_reader :codepoints, :keywords
+
+  def initialize(codepoints:, keywords:) # both keyword arguments are required and stored as given, without copying
+    @codepoints = codepoints
+    @keywords = keywords
+  end
+end
diff --git a/lib/twemoji/db/cldr/emoji_annotations.rb b/lib/twemoji/db/cldr/emoji_annotations.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+class EmojiAnnotations # Ordered collection of annotation objects with lookup by codepoints.
+  attr_reader :annotations # the underlying Array; #add mutates it in place
+
+  def initialize(annotations = []) # the given Array is used directly (not copied); a new empty Array is the default
+    @annotations = annotations
+  end
+
+  def +(emoji_annotations) # returns a new collection: receiver's entries first, then the argument's; neither operand changes
+    self.class.new(
+      self.annotations + emoji_annotations.annotations
+    )
+  end
+
+  def add(annotation) # appends in place and returns the underlying Array
+    @annotations << annotation
+  end
+
+  def find_by(codepoints:) # linear scan; first entry whose codepoints == the argument, or nil when none matches
+    annotations.find { |annotation| codepoints == annotation.codepoints }
+  end
+end
diff --git a/lib/twemoji/db/emoji_data_files.rb b/lib/twemoji/db/emoji_data_files.rb
@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+
+class EmojiDataFiles # Local paths and download URLs of the CLDR data files, all derived from one pinned VERSION.
+  VERSION = "release-35-1" # part of the download URL and also the directory name under the vendor dir
+  VENDOR_DIR = File.expand_path("../../../vendor", __dir__) # three directories above this file's directory, then vendor
+  UNICODE_REPO_ROOT = "https://raw.githubusercontent.com/unicode-org/cldr/#{VERSION}"
+  REMOTE_EMOJI_TEST_PATH = "tools/java/org/unicode/cldr/util/data/emoji/emoji-test.txt"
+  REMOTE_ANNOTATIONS_PATH = "common/annotations/en.xml"
+  REMOTE_ANNOTATIONS_DERIVED_PATH = "common/annotationsDerived/en.xml"
+  private_constant :VERSION, :VENDOR_DIR, :UNICODE_REPO_ROOT # outside code has to go through the class methods below
+  private_constant :REMOTE_EMOJI_TEST_PATH, :REMOTE_ANNOTATIONS_PATH, :REMOTE_ANNOTATIONS_DERIVED_PATH
+
+  def self.version # label of the pinned release, "CLDR " followed by VERSION
+    "CLDR #{VERSION}"
+  end
+
+  def self.emoji_test_file # local path: VENDOR_DIR/VERSION/emoji-test.txt
+    File.join(VENDOR_DIR, VERSION, "emoji-test.txt")
+  end
+
+  def self.annotations_file # local path: VENDOR_DIR/VERSION/annotations/en.xml
+    File.join(VENDOR_DIR, VERSION, "annotations/en.xml")
+  end
+
+  def self.annotations_derived_file # local path: VENDOR_DIR/VERSION/annotationsDerived/en.xml
+    File.join(VENDOR_DIR, VERSION, "annotationsDerived/en.xml")
+  end
+
+  def self.emoji_test_url # File.join is used in the URL helpers only to join the segments with "/"
+    File.join(UNICODE_REPO_ROOT, REMOTE_EMOJI_TEST_PATH)
+  end
+
+  def self.annotations_url
+    File.join(UNICODE_REPO_ROOT, REMOTE_ANNOTATIONS_PATH)
+  end
+
+  def self.annotations_derived_url
+    File.join(UNICODE_REPO_ROOT, REMOTE_ANNOTATIONS_DERIVED_PATH)
+  end
+end
diff --git a/lib/twemoji/db/unicode/emoji.rb b/lib/twemoji/db/unicode/emoji.rb
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+class Emoji # One emoji entry: codepoints, status, unicode and description, read-only after construction.
+  attr_reader :codepoints, :status, :unicode, :description
+
+  def initialize(data) # data must answer [] for :codepoints, :status, :unicode, :description (EmojiTestParser passes a MatchData)
+    @codepoints = data[:codepoints].strip # surrounding whitespace removed; a nil value raises NoMethodError
+    @status = data[:status].strip
+    @unicode = data[:unicode] # stored as given, not stripped
+    @description = data[:description]
+  end
+
+  def to_h(group:, subgroup:, keywords:) # Hash with symbol keys: this emoji's fields plus the caller-supplied group, subgroup, keywords
+    {
+      unicode: unicode,
+      codepoints: codepoints,
+      description: description,
+      keywords: keywords,
+      group: group,
+      subgroup: subgroup,
+      status: status,
+    }
+  end
+
+  def inspect # compact form showing only the unicode field
+    "#<Emoji #{@unicode}>"
+  end
+end
diff --git a/lib/twemoji/db/unicode/emoji_category.rb b/lib/twemoji/db/unicode/emoji_category.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+class EmojiCategory # Named container; EmojiTestParser uses it for both groups and subgroups.
+  attr_reader :name, :type, :emojis
+
+  def initialize(name, type:, emojis: []) # EmojiTestParser passes type "group" or "subgroup"; the default is a new Array per call
+    @name = name
+    @type = type
+    @emojis = emojis
+  end
+
+  def add(emoji) # appends in place and returns the emojis Array; the parser adds subgroups to groups and Emoji objects to subgroups
+    @emojis << emoji
+  end
+
+  def inspect # summary that shows the member count instead of every member
+    "#<EmojiCategory type: #{@type}, name: #{@name}, emojis: #{@emojis.size} #<Emoji> objects>"
+  end
+end
diff --git a/lib/twemoji/db/unicode/emoji_test_parser.rb b/lib/twemoji/db/unicode/emoji_test_parser.rb
@@ -0,0 +1,113 @@
+# frozen_string_literal: true
+
+require_relative "emoji_category"
+require_relative "emoji"
+require_relative "../cldr/annotations"
+require_relative "../emoji_data_files"
+
+class EmojiTestParser
+  def self.parse(file = EmojiDataFiles.emoji_test_file)
+    new(file).parse
+  end
+
+  def initialize(file)
+    @file = file
+  end
+
+  def parse
+    emoji_categories = []
+    group = nil
+    subgroup = nil
+
+    IO.readlines(file).each do |line|
+      case
+      when line.start_with?("# group: ")
+        group = parse_group(line)
+        emoji_categories << group
+      when line.start_with?("# subgroup: ")
+        subgroup = parse_subgroup(line)
+        group.add(subgroup)
+      when comment?(line) || newline?(line)
+        next
+      else
+        emoji = parse_emoji(line)
+        subgroup.add(emoji)
+      end
+    end
+
+    flatten(emoji_categories)
+  end
+
+  private
+  attr_reader :file
+
+  def parse_name(line)
+    _, name = line.split(":", 2)
+    name.strip
+  end
+
+  def parse_group(line)
+    group_name = parse_name(line)
+    EmojiCategory.new(group_name, type: "group")
+  end
+
+  def parse_subgroup(line)
+    group_name = parse_name(line)
+    EmojiCategory.new(group_name, type: "subgroup")
+  end
+
+  def comment?(line)
+    line.start_with?("#")
+  end
+
+  def newline?(line)
+    line.strip.empty?
+  end
+
+  EMOJI_LINE_REGEXP = /(?<codepoints>.+);(?<status>.+)# (?<unicode>[^[\s]]+)\s(?<description>.+)/
+  private_constant :EMOJI_LINE_REGEXP
+
+  def parse_emoji(line)
+    matched = line.match(EMOJI_LINE_REGEXP)
+    Emoji.new(matched)
+  end
+
+  def flatten(data)
+    emojis = []
+
+    data.each do |emoji_category|
+      emoji_category.emojis.each do |emoji_subcategory|
+        emoji_subcategory.emojis.each do |raw_emoji|
+          annotation = get_annotation(raw_emoji.codepoints)
+
+          emoji = raw_emoji.to_h(
+            group: emoji_category.name,
+            subgroup: emoji_subcategory.name,
+            keywords: annotation.keywords,
+          )
+
+          emojis << emoji
+        end
+      end
+    end
+
+    emojis
+  end
+
+  class NullAnnotation
+    def keywords; []; end
+  end
+
+  def get_annotation(codepoints)
+    cldr_annotations.find_by(codepoints: codepoints) || NullAnnotation.new
+  end
+
+  def cldr_annotations
+    @cldr_annotations ||= begin
+      CLDR::Annotations.new(
+        EmojiDataFiles.annotations_file,
+        EmojiDataFiles.annotations_derived_file,
+      ).annotations
+    end
+  end
+end
diff --git a/twemoji.gemspec b/twemoji.gemspec
@@ -13,10 +13,15 @@ Gem::Specification.new do |spec|
   spec.description   = spec.summary
   spec.homepage      = "https://github.com/jollygoodcode/twemoji"
   spec.license       = "MIT"
-
-  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f =~ %r(^(test)/) }
   spec.require_paths = %w(lib)
 
+  spec.files         = Dir[
+    "README.md",
+    "LICENSE.md",
+    "lib/**/*.yml",
+    "lib/**/*.rb",
+  ]
+
   spec.required_ruby_version = "~> 2.0"
 
   spec.add_dependency "nokogiri", "~> 1.6"