Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Support Twemoji v12.0 #50

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
*.o
*.a
mkmf.log
vendor/
1 change: 1 addition & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env rake
require "bundler/gem_tasks"
require "rake/testtask"
load "lib/tasks/db.rake"

Rake::TestTask.new do |t|
t.libs << "test"
Expand Down
42 changes: 42 additions & 0 deletions lib/tasks/db.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# frozen_string_literal: true

require "fileutils"
require "json"

require_relative "../twemoji/db/unicode/emoji_test_parser"

# Downloads +url+ to +output_file+ by shelling out to the `curl` executable.
#
# The parent directory of +output_file+ is created first. `mkdir_p` is a
# no-op when the directory already exists, so no existence check is needed.
#
# @param url [String] remote location to fetch
# @param output_file [String] local path the response body is written to
# @return [true] when curl exits successfully
# @raise [RuntimeError] when curl cannot be run or exits with a non-zero
#   status, so the calling Rake task fails instead of silently continuing
def curl_download(url, output_file)
  puts "Downloading #{url} => #{output_file}"
  FileUtils.mkdir_p(File.dirname(output_file))

  # Arguments are passed as separate words, so no shell is involved.
  return true if system("curl", "-fsSL", url, "-o", output_file)

  raise "Failed to download #{url} to #{output_file}"
end

# One Rake file task per local data file. Each task downloads the file from
# its matching remote URL, using the task name as the destination path.
{
  EmojiDataFiles.emoji_test_file => EmojiDataFiles.emoji_test_url,
  EmojiDataFiles.annotations_file => EmojiDataFiles.annotations_url,
  EmojiDataFiles.annotations_derived_file => EmojiDataFiles.annotations_derived_url,
}.each do |local_path, remote_url|
  file local_path do |task|
    curl_download(remote_url, task.name)
  end
end

namespace :db do
  # Local data files that must be present before emojis can be parsed.
  required_data_files = [
    EmojiDataFiles.emoji_test_file,
    EmojiDataFiles.annotations_file,
    EmojiDataFiles.annotations_derived_file,
  ]

  desc "Prepare data files needed for generating emojis.json"
  task prepare_files: required_data_files

  desc "Generate emojis.json to db folder"
  task dump: [:prepare_files] do
    parsed = EmojiTestParser.parse

    # The parsed emojis are currently printed to stdout as pretty JSON,
    # followed by a one-line summary.
    puts JSON.pretty_generate(parsed)
    puts "Parsed #{parsed.size} emojis from #{EmojiDataFiles.version}!"
  end
end
56 changes: 56 additions & 0 deletions lib/twemoji/db/cldr/annotations.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# frozen_string_literal: true

require "nokogiri"
require_relative "../../utils/unicode"

require_relative "emoji_annotation"
require_relative "emoji_annotations"

module CLDR
  # Reads emoji keyword annotations from two annotation XML files and exposes
  # them as a single combined EmojiAnnotations collection.
  class Annotations
    # @return [EmojiAnnotations] annotations from the first file followed by
    #   those from the second (derived) file
    attr_reader :annotations

    # @param annotation [String] path to the annotations XML file
    # @param annotation_derived [String] path to the derived annotations XML file
    def initialize(annotation, annotation_derived)
      @annotations = parse_files(annotation, annotation_derived)
    end

    private

    # Parses both files and concatenates the results, primary file first.
    def parse_files(annotation, annotation_derived)
      parse_file(annotation) + parse_file(annotation_derived)
    end

    # Parses a single annotation XML file at +path+.
    def parse_file(path)
      parse(get_document(path))
    end

    # Collects one EmojiAnnotation per `annotations annotation` node.
    # Nodes carrying a "type" attribute are skipped. The remaining nodes
    # hold a " | "-separated keyword list as their text.
    def parse(document)
      nodes = document.css("annotations annotation")

      nodes.each_with_object(EmojiAnnotations.new) do |annotation_node, result|
        next if annotation_node.attributes.key?("type")

        result.add(
          EmojiAnnotation.new(
            codepoints: get_codepoints(annotation_node.attributes["cp"].text),
            keywords: annotation_node.text.split(" | "),
          )
        )
      end
    end

    # Uses File.read rather than IO.read: IO.read treats a path starting with
    # "|" as a command to execute, which is never wanted for a data file.
    def get_document(path)
      Nokogiri::XML.parse(File.read(path))
    end

    # Converts the value of a node's "cp" attribute into the upper-cased,
    # space-connected form returned by Twemoji::Utils::Unicode.unpack.
    # NOTE(review): presumably hex codepoints such as "1F600 200D" - confirm
    # against Twemoji::Utils::Unicode.
    def get_codepoints(unicode)
      Twemoji::Utils::Unicode.unpack(unicode, connector: " ").upcase
    end
  end
end
10 changes: 10 additions & 0 deletions lib/twemoji/db/cldr/emoji_annotation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# frozen_string_literal: true

# Read-only pairing of an emoji's codepoints with its keyword list.
class EmojiAnnotation
  attr_reader :codepoints
  attr_reader :keywords

  # Both values are stored exactly as given.
  def initialize(codepoints:, keywords:)
    @codepoints, @keywords = codepoints, keywords
  end
end
23 changes: 23 additions & 0 deletions lib/twemoji/db/cldr/emoji_annotations.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# frozen_string_literal: true

# Ordered collection of annotation objects, searchable by codepoints.
class EmojiAnnotations
  attr_reader :annotations

  # Wraps the given list, or starts with a fresh empty one.
  def initialize(annotations = [])
    @annotations = annotations
  end

  # Returns a new collection holding this collection's entries followed by
  # the other collection's entries. Neither operand is modified.
  def +(emoji_annotations)
    combined = annotations + emoji_annotations.annotations
    self.class.new(combined)
  end

  # Appends a single annotation to this collection.
  def add(annotation)
    @annotations.push(annotation)
  end

  # Returns the first annotation whose codepoints equal the given value,
  # or nil when there is none.
  def find_by(codepoints:)
    annotations.detect do |candidate|
      codepoints == candidate.codepoints
    end
  end
end
40 changes: 40 additions & 0 deletions lib/twemoji/db/emoji_data_files.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# frozen_string_literal: true

# Central place for the local paths and remote URLs of the CLDR data files
# at one pinned CLDR release tag.
class EmojiDataFiles
  VERSION = "release-35-1"
  VENDOR_DIR = File.expand_path("../../../vendor", __dir__)
  UNICODE_REPO_ROOT = "https://raw.githubusercontent.com/unicode-org/cldr/#{VERSION}"
  REMOTE_EMOJI_TEST_PATH = "tools/java/org/unicode/cldr/util/data/emoji/emoji-test.txt"
  REMOTE_ANNOTATIONS_PATH = "common/annotations/en.xml"
  REMOTE_ANNOTATIONS_DERIVED_PATH = "common/annotationsDerived/en.xml"
  private_constant(
    :VERSION,
    :VENDOR_DIR,
    :UNICODE_REPO_ROOT,
    :REMOTE_EMOJI_TEST_PATH,
    :REMOTE_ANNOTATIONS_PATH,
    :REMOTE_ANNOTATIONS_DERIVED_PATH,
  )

  class << self
    # Human-readable label for the pinned release.
    def version
      "CLDR #{VERSION}"
    end

    # Local paths, all under the vendor directory for the pinned release.

    def emoji_test_file
      local_path("emoji-test.txt")
    end

    def annotations_file
      local_path("annotations/en.xml")
    end

    def annotations_derived_file
      local_path("annotationsDerived/en.xml")
    end

    # Remote URLs, all under the repository root for the pinned release.

    def emoji_test_url
      remote_url(REMOTE_EMOJI_TEST_PATH)
    end

    def annotations_url
      remote_url(REMOTE_ANNOTATIONS_PATH)
    end

    def annotations_derived_url
      remote_url(REMOTE_ANNOTATIONS_DERIVED_PATH)
    end

    private

    # Joins a relative name onto the versioned vendor directory.
    def local_path(relative)
      File.join(VENDOR_DIR, VERSION, relative)
    end

    # Joins a repository-relative path onto the remote root.
    def remote_url(relative)
      File.join(UNICODE_REPO_ROOT, relative)
    end
  end
end
28 changes: 28 additions & 0 deletions lib/twemoji/db/unicode/emoji.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# frozen_string_literal: true

# A single emoji entry built from a record that responds to [] with the
# keys :codepoints, :status, :unicode and :description.
class Emoji
  attr_reader :codepoints, :status, :unicode, :description

  # Codepoints and status have surrounding whitespace removed; the unicode
  # string and description are stored as given.
  def initialize(data)
    @unicode = data[:unicode]
    @description = data[:description]
    @codepoints = data[:codepoints].strip
    @status = data[:status].strip
  end

  # Hash form of the emoji combined with the caller-supplied group,
  # subgroup and keywords. Key order is fixed.
  def to_h(group:, subgroup:, keywords:)
    own_fields = { unicode: unicode, codepoints: codepoints, description: description }
    own_fields.merge(keywords: keywords, group: group, subgroup: subgroup, status: status)
  end

  def inspect
    "#<Emoji #{@unicode}>"
  end
end
19 changes: 19 additions & 0 deletions lib/twemoji/db/unicode/emoji_category.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

# A named, typed bucket of members. The parser uses type "group" for
# buckets of subgroups and type "subgroup" for buckets of emojis.
class EmojiCategory
  attr_reader :name, :type, :emojis

  # A new empty member list is created when none is supplied.
  def initialize(name, type:, emojis: [])
    @name, @type, @emojis = name, type, emojis
  end

  # Appends a member to this category.
  def add(emoji)
    @emojis.push(emoji)
  end

  def inspect
    "#<EmojiCategory type: #{@type}, name: #{@name}, emojis: #{@emojis.size} #<Emoji> objects>"
  end
end
113 changes: 113 additions & 0 deletions lib/twemoji/db/unicode/emoji_test_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# frozen_string_literal: true

require_relative "emoji_category"
require_relative "emoji"
require_relative "../cldr/annotations"
require_relative "../emoji_data_files"

# Parses an emoji-test.txt style file into a flat array of emoji hashes.
#
# The file is read line by line:
# * "# group: NAME" starts a new group,
# * "# subgroup: NAME" starts a new subgroup inside the current group,
# * other lines starting with "#" and blank lines are ignored,
# * every remaining line must be an emoji data line and is added to the
#   current subgroup.
#
# Each emoji is then combined with its group name, its subgroup name and the
# keywords found in the CLDR annotations for its codepoints.
class EmojiTestParser
  GROUP_PREFIX = "# group: "
  SUBGROUP_PREFIX = "# subgroup: "

  # "<codepoints> ; <status> # <emoji> <description>"
  EMOJI_LINE_REGEXP = /(?<codepoints>.+);(?<status>.+)# (?<unicode>\S+)\s(?<description>.+)/
  private_constant :GROUP_PREFIX, :SUBGROUP_PREFIX, :EMOJI_LINE_REGEXP

  # Stand-in used when an emoji has no CLDR annotation.
  class NullAnnotation
    def keywords
      []
    end
  end

  # Convenience entry point: parses +file+ (the vendored emoji-test file by
  # default) and returns the flattened emoji hashes.
  #
  # @param file [String] path to the file to parse
  # @return [Array<Hash>]
  def self.parse(file = EmojiDataFiles.emoji_test_file)
    new(file).parse
  end

  # @param file [String] path to the file to parse
  def initialize(file)
    @file = file
  end

  # @return [Array<Hash>] one hash per emoji, in file order
  # @raise [ArgumentError] when a subgroup appears before any group, an emoji
  #   line appears before any subgroup of the current group, or a data line
  #   does not have the expected shape
  def parse
    groups = []
    group = nil
    subgroup = nil

    # File.foreach streams the file and, unlike IO.readlines, never treats a
    # path starting with "|" as a command.
    File.foreach(file) do |line|
      if line.start_with?(GROUP_PREFIX)
        group = parse_group(line)
        groups << group
        # A subgroup belongs to exactly one group; forget the previous one so
        # emojis cannot be filed under a subgroup of an earlier group.
        subgroup = nil
      elsif line.start_with?(SUBGROUP_PREFIX)
        raise ArgumentError, "Subgroup declared before any group: #{line.strip}" unless group

        subgroup = parse_subgroup(line)
        group.add(subgroup)
      elsif comment?(line) || newline?(line)
        next
      else
        raise ArgumentError, "Emoji declared before any subgroup: #{line.strip}" unless subgroup

        subgroup.add(parse_emoji(line))
      end
    end

    flatten(groups)
  end

  private

  attr_reader :file

  # Returns the text after the first ":" of a header line, trimmed.
  def parse_name(line)
    _, name = line.split(":", 2)
    name.strip
  end

  def parse_group(line)
    EmojiCategory.new(parse_name(line), type: "group")
  end

  def parse_subgroup(line)
    EmojiCategory.new(parse_name(line), type: "subgroup")
  end

  def comment?(line)
    line.start_with?("#")
  end

  def newline?(line)
    line.strip.empty?
  end

  # Builds an Emoji from a data line, failing loudly on unexpected input
  # rather than passing nil on to Emoji.
  def parse_emoji(line)
    matched = line.match(EMOJI_LINE_REGEXP)
    raise ArgumentError, "Unrecognized emoji line: #{line.strip}" unless matched

    Emoji.new(matched)
  end

  # Turns the group -> subgroup -> emoji tree into a flat list of hashes,
  # attaching the names of both levels and the annotation keywords.
  def flatten(data)
    data.flat_map do |emoji_category|
      emoji_category.emojis.flat_map do |emoji_subcategory|
        emoji_subcategory.emojis.map do |raw_emoji|
          raw_emoji.to_h(
            group: emoji_category.name,
            subgroup: emoji_subcategory.name,
            keywords: get_annotation(raw_emoji.codepoints).keywords,
          )
        end
      end
    end
  end

  # Looks up the annotation for +codepoints+, falling back to an annotation
  # with no keywords.
  def get_annotation(codepoints)
    cldr_annotations.find_by(codepoints: codepoints) || NullAnnotation.new
  end

  # The CLDR annotation files are parsed once, on first use.
  def cldr_annotations
    @cldr_annotations ||= CLDR::Annotations.new(
      EmojiDataFiles.annotations_file,
      EmojiDataFiles.annotations_derived_file,
    ).annotations
  end
end
9 changes: 7 additions & 2 deletions twemoji.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,15 @@ Gem::Specification.new do |spec|
spec.description = spec.summary
spec.homepage = "https://github.com/jollygoodcode/twemoji"
spec.license = "MIT"

spec.files = `git ls-files -z`.split("\x0").reject { |f| f =~ %r(^(test)/) }
spec.require_paths = %w(lib)

spec.files = Dir[
"README.md",
"LICENSE.md",
"lib/**/*.yml",
"lib/**/*.rb",
]

spec.required_ruby_version = "~> 2.0"

spec.add_dependency "nokogiri", "~> 1.6"
Expand Down