Skip to content

Commit

Permalink
bin/rake data:dump to parse emoji-test.txt and output JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
JuanitoFatas committed Jun 1, 2019
1 parent ed64352 commit 2b21c9f
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 0 deletions.
6 changes: 6 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,10 @@ Rake::TestTask.new do |t|
t.verbose = true
end

namespace :data do
  # `desc` makes the task show up in `rake -T`.
  desc "Parse the vendored emoji-test.txt and print it as JSON"
  task :dump do
    # Kernel#system returns false (non-zero exit) or nil (command could not be
    # run); either way the task must fail instead of silently succeeding.
    succeeded = system "bin/parse-emoji-test", "vendor/Unicode-12.0/emoji-test.txt"
    abort "bin/parse-emoji-test failed" unless succeeded
  end
end

task default: :test
13 changes: 13 additions & 0 deletions bin/parse-emoji-test
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Parse https://unicode.org/Public/emoji/12.0/emoji-test.txt and output JSON.

# Exit cleanly from an early interrupt
Signal.trap("INT") { abort }

require "json"
require_relative "../data/emoji_test_parser"

# Fail fast with a usage hint instead of a TypeError backtrace when the path
# to emoji-test.txt is not given on the command line.
path = ARGV[0]
abort "Usage: #{$PROGRAM_NAME} path/to/emoji-test.txt" if path.nil?

# Print the parsed result as pretty-printed JSON on stdout.
parsed = EmojiTestParser.parse(path)
puts JSON.pretty_generate(parsed)
28 changes: 28 additions & 0 deletions data/emoji.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# frozen_string_literal: true

# One emoji entry built from the named fields of a parsed data line.
class Emoji
  # data - an object that answers #[] for the symbols :codepoints, :status,
  #        :unicode and :description (for example a MatchData or a Hash).
  #        The codepoints and status values have surrounding whitespace
  #        stripped; unicode and description are stored untouched.
  def initialize(data)
    @codepoints, @status = data[:codepoints].strip, data[:status].strip
    @unicode, @description = data[:unicode], data[:description]
  end

  # Returns a Hash describing this emoji together with the group and subgroup
  # names supplied by the caller. Key order is: unicode, codepoints,
  # description, group, subgroup, status.
  def to_h(group:, subgroup:)
    own_fields = { unicode: unicode, codepoints: codepoints, description: description }
    own_fields.merge(group: group, subgroup: subgroup, status: status)
  end

  # Short representation showing only the emoji character itself.
  def inspect
    "#<Emoji #{unicode}>"
  end

  private

  attr_reader :codepoints, :status, :unicode, :description
end
19 changes: 19 additions & 0 deletions data/emoji_category.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

# A named container with a type label and an ordered list of members.
class EmojiCategory
  attr_reader :name, :type, :emojis

  # name   - the category name.
  # type   - a label for the kind of category (callers pass "group" or "subgroup").
  # emojis - the initial member list; a fresh empty Array when omitted.
  def initialize(name, type:, emojis: [])
    @name, @type, @emojis = name, type, emojis
  end

  # Appends a member to the list and returns the list.
  def add(emoji)
    emojis.push(emoji)
  end

  # Compact representation: shows the member count instead of every member.
  def inspect
    member_count = emojis.size
    "#<EmojiCategory type: #{type}, name: #{name}, emojis: #{member_count} #<Emoji> objects>"
  end
end
91 changes: 91 additions & 0 deletions data/emoji_test_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# frozen_string_literal: true

require_relative "emoji_category"
require_relative "emoji"

# Parse https://unicode.org/Public/emoji/12.0/emoji-test.txt into Ruby.
#
# The file is read line by line:
#
# * "# group: NAME" opens a new group,
# * "# subgroup: NAME" opens a new subgroup inside the current group,
# * any other comment line and any blank line is skipped,
# * every remaining line is treated as an emoji entry of the current subgroup.
#
# The result is a flat Array with one Hash per emoji entry, in file order.
class EmojiTestParser
  # Raised when the input does not follow the layout described above. The
  # message carries the file path and the 1-based line number.
  class ParseError < StandardError; end

  # Splits an entry such as
  #   "1F600 ; fully-qualified # <emoji> grinning face"
  # into its four named fields. The emoji itself is the first run of
  # non-whitespace characters after "# ".
  EMOJI_LINE_REGEXP = /(?<codepoints>.+);(?<status>.+)# (?<unicode>\S+)\s(?<description>.+)/
  private_constant :EMOJI_LINE_REGEXP

  # Convenience wrapper: EmojiTestParser.parse(path) == new(path).parse.
  def self.parse(file)
    new(file).parse
  end

  # file - path of the emoji-test.txt file to read.
  def initialize(file)
    @file = file
  end

  # Reads the file and returns an Array of Hashes, one per emoji entry, each
  # produced by Emoji#to_h with the names of its enclosing group and subgroup.
  #
  # Raises ParseError when a subgroup appears before any group, when an emoji
  # entry appears before any subgroup of the current group, or when an entry
  # line cannot be split into its fields.
  def parse
    groups = []
    group = nil
    subgroup = nil

    # File.foreach never interprets the path as a "|command" pipe (IO.readlines
    # does), and the explicit encoding keeps emoji readable under any locale.
    File.foreach(file, encoding: "UTF-8").with_index(1) do |line, number|
      if line.start_with?("# group: ")
        group = parse_group(line)
        # A new group starts without a subgroup, so a stray entry cannot be
        # attached to a subgroup that belongs to the previous group.
        subgroup = nil
        groups << group
      elsif line.start_with?("# subgroup: ")
        parse_error(number, "subgroup appears before any group") if group.nil?
        subgroup = parse_subgroup(line)
        group.add(subgroup)
      elsif comment?(line) || newline?(line)
        next
      else
        parse_error(number, "emoji entry appears before any subgroup") if subgroup.nil?
        subgroup.add(parse_emoji(line, number))
      end
    end

    flatten(groups)
  end

  private

  attr_reader :file

  # Returns the text after the first ":" of the line, without surrounding whitespace.
  def parse_name(line)
    line.split(":", 2).last.strip
  end

  def parse_group(line)
    EmojiCategory.new(parse_name(line), type: "group")
  end

  def parse_subgroup(line)
    EmojiCategory.new(parse_name(line), type: "subgroup")
  end

  def comment?(line)
    line.start_with?("#")
  end

  def newline?(line)
    line.strip.empty?
  end

  # Builds an Emoji from one entry line; number is only used for error reporting.
  def parse_emoji(line, number = nil)
    matched = line.match(EMOJI_LINE_REGEXP)
    parse_error(number, "malformed emoji entry #{line.strip.inspect}") if matched.nil?
    Emoji.new(matched)
  end

  # Raises ParseError with a "path:line: message" style description.
  def parse_error(number, message)
    raise ParseError, "#{file}:#{number}: #{message}"
  end

  # Turns the group -> subgroup -> emoji tree into a flat list of Hashes,
  # tagging every emoji with the names of the categories that contain it.
  def flatten(data)
    data.flat_map do |emoji_category|
      emoji_category.emojis.flat_map do |emoji_subcategory|
        emoji_subcategory.emojis.map do |raw_emoji|
          raw_emoji.to_h(group: emoji_category.name, subgroup: emoji_subcategory.name)
        end
      end
    end
  end
end

0 comments on commit 2b21c9f

Please sign in to comment.