diff --git a/Sources/Megrez/1_Compositor.swift b/Sources/Megrez/1_Compositor.swift index 4abd9d8..c6ca135 100644 --- a/Sources/Megrez/1_Compositor.swift +++ b/Sources/Megrez/1_Compositor.swift @@ -26,7 +26,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. extension Megrez { /// 組字器。 public class Compositor: Grid { - /// 文字輸入方向 + /// 就文字輸入方向而言的方向。 public enum TypingDirection { case front, rear } /// 給被丟掉的節點路徑施加的負權重。 private let kDroppedPathScore: Double = -999 @@ -38,7 +38,14 @@ extension Megrez { private var langModel: LangModelProtocol /// 允許查詢當前游標位置屬於第幾個幅位座標(從 0 開始算)。 private(set) var cursorRegionMap: [Int: Int] = .init() - private(set) var walkedAnchors: [Megrez.NodeAnchor] = [] // 用以記錄爬過的節錨的陣列 + /// 用以記錄爬過的節錨的陣列。 + private(set) var walkedAnchors: [NodeAnchor] = [] + + /// 該函式用以更新爬過的節錨的陣列。 + /// - Parameter nodes: 傳入的節點陣列。 + public func updateWalkedAnchors(with nodes: [Node]) { + walkedAnchors = nodes.map { Megrez.NodeAnchor(node: $0) } + } /// 公開:多字讀音鍵當中用以分割漢字讀音的記號,預設為空。 public var joinSeparator: String = "-" @@ -47,7 +54,7 @@ extension Megrez { public var length: Int { readings.count } /// 按幅位來前後移動游標。 - /// - Parameter direction: 移動方向 + /// - Parameter direction: 移動方向。 /// - Returns: 該操作是否順利完成。 @discardableResult public func jumpCursorBySpan(to direction: TypingDirection) -> Bool { switch direction { @@ -64,8 +71,9 @@ extension Megrez { case currentRegionBorderRear: switch direction { case .front: - if currentRegion > walkedAnchors.count { cursor = readings.count } - else { cursor = walkedAnchors[0...currentRegion].map(\.spanLength).reduce(0, +) } + cursor = + (currentRegion > walkedAnchors.count) + ? readings.count : walkedAnchors[0...currentRegion].map(\.spanLength).reduce(0, +) case .rear: cursor = walkedAnchors[0.. $1.scoreForSort + $0.node.score > $1.node.score } guard !nodes.isEmpty else { return .init() } // 防止下文出現範圍外索引的錯誤 @@ -269,7 +277,7 @@ extension Megrez { if hasMatchedNode(location: p, spanLength: q, key: combinedReading) { continue } let unigrams: [Unigram] = langModel.unigramsFor(key: combinedReading) if unigrams.isEmpty { continue } - let n = Node(key: combinedReading, unigrams: unigrams) + let n: Node = .init(key: combinedReading, spanLength: q, unigrams: unigrams) insertNode(node: n, location: p, spanLength: q) } } @@ -281,6 +289,7 @@ extension Megrez { internal func updateCursorJumpingTables(_ anchors: [NodeAnchor]) { var cursorRegionMapDict = [Int: Int]() + cursorRegionMapDict[-1] = 0 // 防呆 var counter = 0 for (i, anchor) in anchors.enumerated() { for _ in 0.. NodeAnchor { let location = abs(location) // 防呆 var node = NodeAnchor() - for theAnchor in nodesOverlappedAt(location: location) { + for theAnchor in nodesCrossingOrEndingAt(location: location) { let candidates = theAnchor.node.candidates // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 theAnchor.node.resetCandidate() @@ -217,7 +199,7 @@ extension Megrez { @discardableResult public func fixNodeWithCandidate(_ pair: KeyValuePaired, at location: Int) -> NodeAnchor { let location = abs(location) // 防呆 var node = NodeAnchor() - for theAnchor in nodesOverlappedAt(location: location) { + for theAnchor in nodesCrossingOrEndingAt(location: location) { let candidates = theAnchor.node.candidates // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 theAnchor.node.resetCandidate() diff --git a/Sources/Megrez/3_NodeAnchor.swift b/Sources/Megrez/3_NodeAnchor.swift index 40c9c89..2ce8982 100644 --- a/Sources/Megrez/3_NodeAnchor.swift +++ b/Sources/Megrez/3_NodeAnchor.swift @@ -30,28 +30,35 @@ extension Megrez { public var isEmpty: Bool { node.key.isEmpty } /// 節點。一個節锚內不一定有節點。 public var node: Node = .init() - /// 節锚所在的位置。 - public var location: Int = 0 /// 指定的幅位長度。 - public var spanLength: Int = 0 + public var spanLength: Int { node.spanLength } + /// 獲取用來比較的權重。 + public var scoreForSort: Double { node.score } /// 累計權重。 public var mass: Double = 0.0 - /// 索引鍵的長度。 - public var keyLength: Int { - isEmpty ? node.key.count : 0 + /// 單元圖陣列。 + public var unigrams: [Unigram] { node.unigrams } + /// 雙元圖陣列。 + public var bigrams: [Bigram] { node.bigrams } + /// 鍵。 + public var key: String { node.key } + + /// 初期化一個節錨。 + public init(node: Node = .init(), mass: Double? = nil) { + self.node = node + self.mass = mass ?? self.node.score } + /// 將該節錨雜湊化。 public func hash(into hasher: inout Hasher) { hasher.combine(node) - hasher.combine(location) - hasher.combine(spanLength) hasher.combine(mass) } /// 將當前節锚列印成一個字串。 public var description: String { var stream = "" - stream += "{@(" + String(location) + "," + String(spanLength) + ")," + stream += "{@(" + String(spanLength) + ")," if node.key.isEmpty { stream += node.description } else { @@ -60,11 +67,6 @@ extension Megrez { stream += "}" return stream } - - /// 獲取用來比較的權重。 - public var scoreForSort: Double { - isEmpty ? node.score : 0 - } } } diff --git a/Sources/Megrez/4_Node.swift b/Sources/Megrez/4_Node.swift index f5fc0d6..af951a5 100644 --- a/Sources/Megrez/4_Node.swift +++ b/Sources/Megrez/4_Node.swift @@ -30,7 +30,7 @@ extension Megrez { lhs.key == rhs.key && lhs.score == rhs.score && lhs.unigrams == rhs.unigrams && lhs.bigrams == rhs.bigrams && lhs.candidates == rhs.candidates && lhs.valueUnigramIndexMap == rhs.valueUnigramIndexMap && lhs.precedingBigramMap == rhs.precedingBigramMap && lhs.isCandidateFixed == rhs.isCandidateFixed - && lhs.selectedUnigramIndex == rhs.selectedUnigramIndex + && lhs.selectedUnigramIndex == rhs.selectedUnigramIndex && lhs.spanLength == rhs.spanLength } public func hash(into hasher: inout Hasher) { @@ -38,6 +38,7 @@ extension Megrez { hasher.combine(score) hasher.combine(unigrams) hasher.combine(bigrams) + hasher.combine(spanLength) hasher.combine(candidates) hasher.combine(valueUnigramIndexMap) hasher.combine(precedingBigramMap) @@ -50,9 +51,11 @@ extension Megrez { /// 當前節點的當前被選中的候選字詞「在該節點內的」目前的權重。 private(set) var score: Double = 0 /// 單元圖陣列。 - private var unigrams: [Unigram] + private(set) var unigrams: [Unigram] /// 雙元圖陣列。 - private var bigrams: [Bigram] + private(set) var bigrams: [Bigram] + /// 指定的幅位長度。 + public var spanLength: Int = 0 /// 候選字詞陣列,以鍵值陣列的形式存在。 private(set) var candidates: [KeyValuePaired] = [] /// 專門「用單元圖資料值來調查索引值」的辭典。 @@ -83,10 +86,11 @@ extension Megrez { /// - key: 索引鍵。 /// - unigrams: 單元圖陣列。 /// - bigrams: 雙元圖陣列(非必填)。 - public init(key: String = "", unigrams: [Megrez.Unigram] = [], bigrams: [Megrez.Bigram] = []) { + public init(key: String = "", spanLength: Int = 0, unigrams: [Megrez.Unigram] = [], bigrams: [Megrez.Bigram] = []) { self.key = key self.unigrams = unigrams self.bigrams = bigrams + self.spanLength = spanLength self.unigrams.sort { $0.score > $1.score diff --git a/Sources/Megrez/5_LanguageModel.swift b/Sources/Megrez/5_LanguageModel.swift index 75ee404..c5dda60 100644 --- a/Sources/Megrez/5_LanguageModel.swift +++ b/Sources/Megrez/5_LanguageModel.swift @@ -28,7 +28,7 @@ public protocol LangModelProtocol { func unigramsFor(key: String) -> [Megrez.Unigram] /// 給定當前鍵與前述鍵,讓語言模型找給一組雙元圖陣列。 - func bigramsForKeys(precedingKey: String, key: String) -> [Megrez.Bigram] + func bigramsFor(precedingKey: String, key: String) -> [Megrez.Bigram] /// 給定鍵,確認是否有單元圖記錄在庫。 func hasUnigramsFor(key: String) -> Bool @@ -47,7 +47,7 @@ extension Megrez { } /// 給定當前鍵與前述鍵,讓語言模型找給一組雙元圖陣列。 - open func bigramsForKeys(precedingKey: String, key: String) -> [Megrez.Bigram] { + open func bigramsFor(precedingKey: String, key: String) -> [Megrez.Bigram] { precedingKey == key ? [Megrez.Bigram]() : [Megrez.Bigram]() } diff --git a/Tests/MegrezTests/LMDataForTests.swift b/Tests/MegrezTests/LMDataForTests.swift index 5ab38b5..674a675 100644 --- a/Tests/MegrezTests/LMDataForTests.swift +++ b/Tests/MegrezTests/LMDataForTests.swift @@ -28,7 +28,7 @@ import Megrez // MARK: - 用以測試的語言模型(簡單範本型) class SimpleLM: LangModelProtocol { - func bigramsForKeys(precedingKey _: String, key _: String) -> [Megrez.Bigram] { + func bigramsFor(precedingKey _: String, key _: String) -> [Megrez.Bigram] { .init() } @@ -65,7 +65,7 @@ class SimpleLM: LangModelProtocol { } class MockLM: LangModelProtocol { - func bigramsForKeys(precedingKey _: String, key _: String) -> [Megrez.Bigram] { + func bigramsFor(precedingKey _: String, key _: String) -> [Megrez.Bigram] { .init() } diff --git a/Tests/MegrezTests/MegrezTests.swift b/Tests/MegrezTests/MegrezTests.swift index f541d27..09e4360 100644 --- a/Tests/MegrezTests/MegrezTests.swift +++ b/Tests/MegrezTests/MegrezTests.swift @@ -85,7 +85,7 @@ final class MegrezTests: XCTestCase { func testInvalidOperations() throws { class TestLM: LangModelProtocol { - func bigramsForKeys(precedingKey _: String, key _: String) -> [Megrez.Bigram] { + func bigramsFor(precedingKey _: String, key _: String) -> [Megrez.Bigram] { .init() } @@ -334,24 +334,34 @@ final class MegrezTests: XCTestCase { let compositor = Megrez.Compositor(lm: SimpleLM(input: strSampleData)) compositor.joinSeparator = "" compositor.insertReading("gao1") + compositor.walk() compositor.insertReading("ji4") + compositor.walk() compositor.cursor = 1 compositor.insertReading("ke1") + compositor.walk() compositor.cursor = 0 compositor.dropReading(direction: .front) + compositor.walk() compositor.insertReading("gao1") + compositor.walk() compositor.cursor = compositor.length compositor.insertReading("gong1") + compositor.walk() compositor.insertReading("si1") + compositor.walk() compositor.insertReading("de5") + compositor.walk() compositor.insertReading("nian2") + compositor.walk() compositor.insertReading("zhong1") + compositor.walk() compositor.insertReading("jiang3") + compositor.walk() compositor.insertReading("jin1") compositor.walk() XCTAssertEqual(compositor.walkedAnchors.values, ["高科技", "公司", "的", "年中", "獎金"]) XCTAssertEqual(compositor.length, 10) - XCTAssert(!compositor.fixNodeWithCandidate(.init(key: "nian2zhong1", value: "年終"), at: 6).isEmpty) XCTAssert(!compositor.fixNodeWithCandidate(.init(key: "nian2zhong1", value: "年終"), at: 7).isEmpty) compositor.cursor = 8 XCTAssert(!compositor.fixNodeWithCandidate(.init(key: "nian2zhong1", value: "年終"), at: compositor.cursor).isEmpty) @@ -379,6 +389,8 @@ final class MegrezTests: XCTestCase { XCTAssertEqual(compositor.cursor, 10) XCTAssertFalse(compositor.jumpCursorBySpan(to: .front)) XCTAssertEqual(compositor.cursor, 10) + compositor.walk() + XCTAssertEqual(compositor.walkedAnchors.values, ["高科技", "公司", "的", "年終", "獎金"]) } func testOverrideOverlappingNodes() throws { @@ -449,11 +461,11 @@ final class MegrezTests: XCTestCase { var result = compositor.walk() XCTAssertEqual(result.values, ["高熱", "火焰", "危險", "蜜蜂"]) - compositor.fixNodeWithCandidate(.init(key: "huo3", value: "🔥"), at: 2) + compositor.fixNodeWithCandidate(.init(key: "huo3", value: "🔥"), at: 3) result = compositor.walk() XCTAssertEqual(result.values, ["高熱", "🔥", "焰", "危險", "蜜蜂"]) - compositor.fixNodeWithCandidate(.init(key: "huo3yan4", value: "🔥"), at: 3) + compositor.fixNodeWithCandidate(.init(key: "huo3yan4", value: "🔥"), at: 4) result = compositor.walk() XCTAssertEqual(result.values, ["高熱", "🔥", "危險", "蜜蜂"]) @@ -471,23 +483,22 @@ final class MegrezTests: XCTestCase { func testStressBenchmark_MachineGun() throws { // 測試結果發現:只敲入完全雷同的某個漢字的話,想保證使用體驗就得讓一個組字區最多塞 20 字。 // 但是呢,日常敲字都是在敲人話,不會出現這種情形,所以組字區內塞 40 字都沒問題。 - // 天權星引擎目前暫時沒有條件引入 Gramambular 2 的繁天頂(Vertex)算法,只能先這樣了。 // 竊以為「讓組字區內容無限擴張」是個偽需求,畢竟組字區太長了的話編輯起來也很麻煩。 - NSLog("// Stress test preparation begins.") + NSLog("// Normal walk: Machine-Gun Stress test preparation begins.") let compositor = Megrez.Compositor(lm: SimpleLM(input: strStressData)) for _ in 0..<20 { // 這個測試最多只能塞 20 字,否則會慢死。 compositor.insertReading("yi1") } - NSLog("// Stress test started.") + NSLog("// Normal walk: Machine-Gun Stress test started.") let startTime = CFAbsoluteTimeGetCurrent() - _ = compositor.walk() + compositor.walk() let timeElapsed = CFAbsoluteTimeGetCurrent() - startTime - NSLog("// Stress test elapsed: \(timeElapsed)s.") + NSLog("// Normal walk: Machine-Gun Stress test elapsed: \(timeElapsed)s.") } func testStressBenchmark_SpeakLikeAHuman() throws { // 與前一個測試相同,但這次測試的是正常人講話。可以看到在這種情況下目前的算法還是比較耐操的。 - NSLog("// Stress test preparation begins.") + NSLog("// Normal walk: Stress test preparation begins.") let compositor = Megrez.Compositor(lm: SimpleLM(input: strSampleData)) let testMaterial: [String] = ["gao1", "ke1", "ji4", "gong1", "si1", "de5", "nian2", "zhong1", "jiang3", "jin1"] for _ in 0..<114 { // 都敲出第一個野獸常數了,再不夠用就不像話了。 @@ -495,10 +506,10 @@ final class MegrezTests: XCTestCase { compositor.insertReading(neta) } } - NSLog("// Stress test started.") + NSLog("// Normal walk: Stress test started.") let startTime = CFAbsoluteTimeGetCurrent() - _ = compositor.walk() + compositor.walk() let timeElapsed = CFAbsoluteTimeGetCurrent() - startTime - NSLog("// Stress test elapsed: \(timeElapsed)s.") + NSLog("// Normal walk: Stress test elapsed: \(timeElapsed)s.") } }