From c8ee945e30678b1a34cd011f63bfea0215735b7f Mon Sep 17 00:00:00 2001 From: Aliaksandr Dziarkach <18146690+AliaksandrDziarkach@users.noreply.github.com> Date: Mon, 11 Nov 2024 18:15:52 +0300 Subject: [PATCH] #2617 - Line breaks should be ignored for three letter sequence codes Fix code. Add UTs. --- .../ref/formats/seq3letter_to_ket.py.out | 1 + .../ref/peptides_3letter_line_break.ket | 895 ++++++++++++++++++ .../ref/peptides_3letter_line_break.seq3 | 5 + .../tests/formats/seq3letter_to_ket.py | 5 +- .../molecule/src/sequence_loader.cpp | 5 +- 5 files changed, 909 insertions(+), 2 deletions(-) create mode 100644 api/tests/integration/tests/formats/ref/peptides_3letter_line_break.ket create mode 100644 api/tests/integration/tests/formats/ref/peptides_3letter_line_break.seq3 diff --git a/api/tests/integration/ref/formats/seq3letter_to_ket.py.out b/api/tests/integration/ref/formats/seq3letter_to_ket.py.out index e572c41a13..c02a96422f 100644 --- a/api/tests/integration/ref/formats/seq3letter_to_ket.py.out +++ b/api/tests/integration/ref/formats/seq3letter_to_ket.py.out @@ -1,5 +1,6 @@ *** 3 LETTER SEQUENCE to KET *** peptides_3letter : SUCCEED +peptides_3letter_line_break : SUCCEED Test 'ALA': got expected error 'Given string cannot be interpreted as a valid three letter sequence because of incorrect formatting.' Test 'Al a': got expected error 'Given string cannot be interpreted as a valid three letter sequence because of incorrect formatting.' Test 'ala': got expected error 'Given string cannot be interpreted as a valid three letter sequence because of incorrect formatting.' diff --git a/api/tests/integration/tests/formats/ref/peptides_3letter_line_break.ket b/api/tests/integration/tests/formats/ref/peptides_3letter_line_break.ket new file mode 100644 index 0000000000..a547de82bc --- /dev/null +++ b/api/tests/integration/tests/formats/ref/peptides_3letter_line_break.ket @@ -0,0 +1,895 @@ +{ + "root": { + "nodes": [ + { + "$ref": "monomer0" + }, + { + "$ref": "monomer1" + }, + { + "$ref": "monomer2" + }, + { + "$ref": "monomer3" + } + ], + "connections": [ + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer0", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer1", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer1", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer2", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer2", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer3", + "attachmentPointId": "R1" + } + } + ], + "templates": [ + { + "$ref": "monomerTemplate-A___Alanine" + }, + { + "$ref": "monomerTemplate-R___Arginine" + }, + { + "$ref": "monomerTemplate-N___Asparagine" + }, + { + "$ref": "monomerTemplate-D___Aspartic acid" + } + ] + }, + "monomer0": { + "type": "monomer", + "id": "0", + "seqid": 1, + "position": { + "x": 0.000000, + "y": -0.000000 + }, + "alias": "A", + "templateId": "A___Alanine" + }, + "monomer1": { + "type": "monomer", + "id": "1", + "seqid": 2, + "position": { + "x": 1.600000, + "y": -0.000000 + }, + "alias": "R", + "templateId": "R___Arginine" + }, + "monomer2": { + "type": "monomer", + "id": "2", + "seqid": 3, + "position": { + "x": 3.200000, + "y": -0.000000 + }, + "alias": "N", + "templateId": "N___Asparagine" + }, + "monomer3": { + "type": "monomer", + "id": "3", + "seqid": 4, + "position": { + "x": 4.800000, + "y": -0.000000 + }, + "alias": "D", + "templateId": "D___Aspartic acid" + }, + "monomerTemplate-A___Alanine": { + "type": "monomerTemplate", + "id": "A___Alanine", + "class": "AminoAcid", + "classHELM": "PEPTIDE", + "fullName": "Alanine", + "alias": "A", + "naturalAnalogShort": "A", + "attachmentPoints": [ + { + "attachmentAtom": 0, + "type": "left", + "leavingGroup": { + "atoms": [ + 6 + ] + } + }, + { + "attachmentAtom": 3, + "type": "right", + "leavingGroup": { + "atoms": [ + 5 + ] + } + } + ], + "atoms": [ + { + "label": "N", + "location": [ + -1.254900, + -0.392000, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.272000, + 0.263300, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + "location": [ + -0.310300, + 1.739300, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 1.052300, + -0.392000, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.082900, + -1.572200, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 2.035300, + 0.263300, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -2.333400, + 0.090500, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 1, + 0 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 1, + 3 + ] + }, + { + "type": 2, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 6 + ] + } + ] + }, + "monomerTemplate-R___Arginine": { + "type": "monomerTemplate", + "id": "R___Arginine", + "class": "AminoAcid", + "classHELM": "PEPTIDE", + "fullName": "Arginine", + "alias": "R", + "naturalAnalogShort": "R", + "attachmentPoints": [ + { + "attachmentAtom": 3, + "type": "left", + "leavingGroup": { + "atoms": [ + 4 + ] + } + }, + { + "attachmentAtom": 0, + "type": "right", + "leavingGroup": { + "atoms": [ + 12 + ] + } + }, + { + "attachmentAtom": 10, + "type": "side", + "leavingGroup": { + "atoms": [ + 13 + ] + } + } + ], + "atoms": [ + { + "label": "C", + "location": [ + 1.771800, + -2.589100, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.773200, + -3.533700, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.748300, + -1.999400, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "N", + "location": [ + -0.275200, + -2.589100, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -1.093200, + -2.116800, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.746400, + -0.818200, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.277100, + -0.228400, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.278900, + 0.952900, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -1.302400, + 1.542600, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -1.304200, + 2.723800, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -0.486800, + 3.197100, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -2.122700, + 3.195500, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 2.589200, + -2.115900, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -0.488300, + 4.378600, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 2, + "atoms": [ + 1, + 0 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 5 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 5, + 6 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 7 + ] + }, + { + "type": 1, + "atoms": [ + 7, + 8 + ] + }, + { + "type": 1, + "atoms": [ + 8, + 9 + ] + }, + { + "type": 1, + "atoms": [ + 9, + 10 + ] + }, + { + "type": 2, + "atoms": [ + 9, + 11 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 12 + ] + }, + { + "type": 1, + "atoms": [ + 10, + 13 + ] + } + ] + }, + "monomerTemplate-N___Asparagine": { + "type": "monomerTemplate", + "id": "N___Asparagine", + "class": "AminoAcid", + "classHELM": "PEPTIDE", + "fullName": "Asparagine", + "alias": "N", + "naturalAnalogShort": "N", + "attachmentPoints": [ + { + "attachmentAtom": 3, + "type": "left", + "leavingGroup": { + "atoms": [ + 4 + ] + } + }, + { + "attachmentAtom": 0, + "type": "right", + "leavingGroup": { + "atoms": [ + 9 + ] + } + }, + { + "attachmentAtom": 7, + "type": "side", + "leavingGroup": { + "atoms": [ + 10 + ] + } + } + ], + "atoms": [ + { + "label": "C", + "location": [ + 1.892900, + -1.417500, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.894700, + -2.598900, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.612700, + -0.679900, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "N", + "location": [ + -0.667600, + -1.417500, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -1.690700, + -0.826600, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.610400, + 0.797800, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.669800, + 1.535400, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -1.692200, + 0.943400, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + -0.671600, + 2.716800, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 2.915300, + -0.825500, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -2.534100, + 1.772400, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 2, + "atoms": [ + 1, + 0 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 5 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 5, + 6 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 7 + ] + }, + { + "type": 2, + "atoms": [ + 6, + 8 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 9 + ] + }, + { + "type": 1, + "atoms": [ + 7, + 10 + ] + } + ] + }, + "monomerTemplate-D___Aspartic acid": { + "type": "monomerTemplate", + "id": "D___Aspartic acid", + "class": "AminoAcid", + "classHELM": "PEPTIDE", + "fullName": "Aspartic acid", + "alias": "D", + "naturalAnalogShort": "D", + "attachmentPoints": [ + { + "attachmentAtom": 3, + "type": "left", + "leavingGroup": { + "atoms": [ + 4 + ] + } + }, + { + "attachmentAtom": 0, + "type": "right", + "leavingGroup": { + "atoms": [ + 9 + ] + } + }, + { + "attachmentAtom": 8, + "type": "side", + "leavingGroup": { + "atoms": [ + 10 + ] + } + } + ], + "atoms": [ + { + "label": "C", + "location": [ + 1.631000, + -1.557800, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 1.632700, + -2.739200, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.350700, + -0.820100, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "N", + "location": [ + -0.929500, + -1.557800, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -1.952500, + -0.966900, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + 0.348500, + 0.657500, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.931700, + 1.395200, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + -1.954200, + 0.803200, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + -0.933500, + 2.576600, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 2.653400, + -0.965800, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + 0.085100, + 3.175100, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 2, + "atoms": [ + 1, + 0 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 5 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 5, + 6 + ] + }, + { + "type": 2, + "atoms": [ + 6, + 7 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 8 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 9 + ] + }, + { + "type": 1, + "atoms": [ + 8, + 10 + ] + } + ] + } +} \ No newline at end of file diff --git a/api/tests/integration/tests/formats/ref/peptides_3letter_line_break.seq3 b/api/tests/integration/tests/formats/ref/peptides_3letter_line_break.seq3 new file mode 100644 index 0000000000..1f0e081627 --- /dev/null +++ b/api/tests/integration/tests/formats/ref/peptides_3letter_line_break.seq3 @@ -0,0 +1,5 @@ +Ala +Ar +gAs +nA +sp \ No newline at end of file diff --git a/api/tests/integration/tests/formats/seq3letter_to_ket.py b/api/tests/integration/tests/formats/seq3letter_to_ket.py index 51f7f7b078..8a33a7e1fa 100644 --- a/api/tests/integration/tests/formats/seq3letter_to_ket.py +++ b/api/tests/integration/tests/formats/seq3letter_to_ket.py @@ -27,7 +27,10 @@ def find_diff(a, b): ref_path = joinPathPy("ref/", __file__) -files = ["peptides_3letter"] +files = [ + "peptides_3letter", + "peptides_3letter_line_break", +] lib = indigo.loadMonomerLibraryFromFile( diff --git a/core/indigo-core/molecule/src/sequence_loader.cpp b/core/indigo-core/molecule/src/sequence_loader.cpp index 6e2ae4d0ff..cc47098707 100644 --- a/core/indigo-core/molecule/src/sequence_loader.cpp +++ b/core/indigo-core/molecule/src/sequence_loader.cpp @@ -2033,11 +2033,14 @@ void SequenceLoader::load3LetterSequence(KetDocument& document) if (!std::isalpha(ch) || !std::isupper(ch)) throw Error(wrong_format); std::string monomer(1, ch); - for (auto i = 0; i < 2; i++) // read two chars + for (auto i = 0; i < 2;) // read two chars { if (_scanner.isEOF()) throw Error(wrong_format); ch = _scanner.readChar(); + if (ch == '\n' || ch == '\r') + continue; + i++; if (!std::isalpha(ch) || !std::islower(ch)) throw Error(wrong_format); monomer += ch;