From fcdebb7123f87e03ba63973abe4533e13ba7c827 Mon Sep 17 00:00:00 2001
From: yash-puligundla <yasasvini.puligundla@ga4gh.org>
Date: Thu, 2 Mar 2023 16:11:16 -0500
Subject: [PATCH] Add NameTokenization Decoder

---
 .../NameTokenisationDecode.java               | 146 ++++++++++++++++++
 .../nametokenisation/TokenStreams.java        | 107 +++++++++++++
 .../nametokenisation/tokens/Token.java        |  17 ++
 .../compression/nametokenisation/utils.java   |  17 ++
 .../cram/NameTokenizationInteropTest.java     |  87 +++++++++++
 5 files changed, 374 insertions(+)
 create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
 create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
 create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/Token.java
 create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/utils.java
 create mode 100644 src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
new file mode 100644
index 0000000000..54479f1bd5
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
@@ -0,0 +1,146 @@
+package htsjdk.samtools.cram.compression.nametokenisation;
+
+import htsjdk.samtools.cram.CRAMException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.List;
+
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_TYPE;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_STRING;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_CHAR;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS0;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DZLEN;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DUP;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA0;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_MATCH;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_END;
+
+public class NameTokenisationDecode {
+
+
+    /** Decodes a name-tokenised block, terminating every decoded name with a newline. */
+    public static String uncompress(final ByteBuffer inBuffer) {
+        return uncompress(inBuffer, "\n");
+    }
+
+    /** Decodes a name-tokenised block, appending {@code separator} after every decoded name. */
+    public static String uncompress(final ByteBuffer inBuffer, final String separator) {
+        inBuffer.order(ByteOrder.LITTLE_ENDIAN);
+        inBuffer.getInt(); // uncompressed length: read only to advance past it, the value is not needed
+        final int numNames = inBuffer.getInt();
+        final int useArith = inBuffer.get() & 0xFF;
+        final TokenStreams tokenStreams = new TokenStreams(inBuffer, useArith, numNames);
+        final List<List<String>> tokensList = new ArrayList<>(numNames);
+        final StringBuilder decodedNames = new StringBuilder(); // avoids quadratic String += copying
+        for (int i = 0; i < numNames; i++) {
+            tokensList.add(new ArrayList<>()); // empty token list that decodeSingleName fills for name i
+            decodedNames.append(decodeSingleName(tokenStreams, tokensList, i)).append(separator);
+        }
+        return decodedNames.toString();
+    }
+
+    /** Decodes name {@code currentNameIndex}, recording its tokens in {@code tokensList} for later names to reference. */
+    private static String decodeSingleName(final TokenStreams tokenStreams,
+                                           final List<List<String>> tokensList,
+                                           final int currentNameIndex) {
+        int type = tokenStreams.getTokenStreamBuffer(0, TOKEN_TYPE).get() & 0xFF;
+        final int dist = tokenStreams.getTokenStreamBuffer(0, type).order(ByteOrder.LITTLE_ENDIAN).getInt();
+        final int prevNameIndex = currentNameIndex - dist;
+        if (type == TOKEN_DUP) {
+            // set, not add: add(index, e) inserts and shifts later entries, growing the list on every DUP
+            tokensList.set(currentNameIndex, tokensList.get(prevNameIndex));
+            return String.join("", tokensList.get(currentNameIndex));
+        }
+        int tokenPosition = 1;
+        do {
+            type = tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_TYPE).get() & 0xFF;
+            String currentToken = ""; // stays empty for TOKEN_END and for any type not handled below
+            switch (type) {
+                case TOKEN_CHAR:
+                    // mask before widening: a bare (char) cast sign-extends bytes >= 0x80 to U+FFxx
+                    final int charByte = tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_CHAR).get() & 0xFF;
+                    currentToken = String.valueOf((char) charByte);
+                    break;
+                case TOKEN_STRING:
+                    currentToken = readString(tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_STRING));
+                    break;
+                case TOKEN_DIGITS:
+                    currentToken = getDigitsToken(tokenStreams, tokenPosition, TOKEN_DIGITS);
+                    break;
+                case TOKEN_DIGITS0:
+                    final String digits0Token = getDigitsToken(tokenStreams, tokenPosition, TOKEN_DIGITS0);
+                    final int lenDigits0Token = tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_DZLEN).get() & 0xFF;
+                    currentToken = leftPadNumber(digits0Token, lenDigits0Token);
+                    break;
+                case TOKEN_DELTA:
+                    currentToken = getDeltaToken(tokenStreams, tokenPosition, tokensList, prevNameIndex, TOKEN_DELTA);
+                    break;
+                case TOKEN_DELTA0:
+                    final String delta0Token = getDeltaToken(tokenStreams, tokenPosition, tokensList, prevNameIndex, TOKEN_DELTA0);
+                    final int lenDelta0Token = tokensList.get(prevNameIndex).get(tokenPosition - 1).length();
+                    currentToken = leftPadNumber(delta0Token, lenDelta0Token);
+                    break;
+                case TOKEN_MATCH:
+                    currentToken = tokensList.get(prevNameIndex).get(tokenPosition - 1);
+                    break;
+            }
+            tokensList.get(currentNameIndex).add(tokenPosition - 1, currentToken);
+            tokenPosition++;
+        } while (type != TOKEN_END);
+        return String.join("", tokensList.get(currentNameIndex));
+    }
+
+    /**
+     * Returns, as decimal text, the numeric token at {@code tokenPosition} of the previous name
+     * plus the unsigned one-byte delta read from the {@code tokenType} stream at that position.
+     */
+    private static String getDeltaToken(final TokenStreams tokenStreams, final int tokenPosition,
+            final List<List<String>> tokensList, final int prevNameIndex, final int tokenType) {
+        if (tokenType != TOKEN_DELTA && tokenType != TOKEN_DELTA0) {
+            throw new CRAMException(String.format("Invalid tokenType : %s. tokenType must be either TOKEN_DELTA or TOKEN_DELTA0", tokenType));
+        }
+        long prevToken; // long: getDigitsToken emits unsigned 32-bit values, too large for Integer.parseInt
+        try {
+            prevToken = Long.parseLong(tokensList.get(prevNameIndex).get(tokenPosition - 1));
+        } catch (NumberFormatException e) {
+            final String exceptionMessageSubstring = (tokenType == TOKEN_DELTA) ? "DIGITS or DELTA" : "DIGITS0 or DELTA0";
+            throw new CRAMException(String.format("The token in the prior name must be of type %s", exceptionMessageSubstring), e);
+        }
+        final int deltaTokenValue = tokenStreams.getTokenStreamBuffer(tokenPosition, tokenType).get() & 0xFF;
+        return Long.toString(prevToken + deltaTokenValue);
+    }
+
+    /** Reads the next 32-bit little-endian value of the {@code tokenType} stream and returns it, unsigned, as decimal text. */
+    private static String getDigitsToken(final TokenStreams tokenStreams, final int tokenPosition, final int tokenType) {
+        if (tokenType != TOKEN_DIGITS && tokenType != TOKEN_DIGITS0) {
+            throw new CRAMException(String.format("Invalid tokenType : %s. tokenType must be either TOKEN_DIGITS or TOKEN_DIGITS0", tokenType));
+        }
+        final ByteBuffer valueStream = tokenStreams.getTokenStreamBuffer(tokenPosition, tokenType);
+        final int rawValue = valueStream.order(ByteOrder.LITTLE_ENDIAN).getInt();
+        return Long.toString(Integer.toUnsignedLong(rawValue));
+    }
+
+    /**
+     * Reads bytes up to the next zero byte (which is consumed) and returns them, one char per byte.
+     */
+    private static String readString(ByteBuffer inputBuffer) {
+        // spec: fetch one byte at a time from the value byte stream until the byte retrieved is zero
+        final StringBuilder sb = new StringBuilder();
+        for (byte b = inputBuffer.get(); b != 0; b = inputBuffer.get()) {
+            sb.append((char) (b & 0xFF)); // mask: a bare (char) cast sign-extends bytes >= 0x80
+        }
+        return sb.toString();
+    }
+
+    /** Prefixes {@code value} with zeros until it is at least {@code len} characters long. */
+    private static String leftPadNumber(String value, int len) {
+        for (int width = value.length(); width < len; width++) {
+            value = "0" + value;
+        }
+        return value;
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
new file mode 100644
index 0000000000..51e497fb26
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
@@ -0,0 +1,107 @@
+package htsjdk.samtools.cram.compression.nametokenisation;
+
+import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.compression.nametokenisation.tokens.Token;
+import htsjdk.samtools.cram.compression.range.RangeDecode;
+import htsjdk.samtools.cram.compression.rans.RANSDecode;
+import htsjdk.samtools.cram.compression.rans.Utils;
+import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+public class TokenStreams {
+    // Token type identifiers; the decoder also passes them as the type argument of getTokenStreamBuffer.
+    public static final int TOKEN_TYPE = 0; // stream of token type bytes, one read per name
+    public static final int TOKEN_STRING  = 1; // zero-terminated string
+    public static final int TOKEN_CHAR = 2; // single byte emitted as one character
+    public static final int TOKEN_DIGITS0 = 3; // 32-bit number, zero-padded to the DZLEN width
+    public static final int TOKEN_DZLEN = 4; // one-byte width used to pad a DIGITS0 number
+    public static final int TOKEN_DUP = 5; // the whole name repeats an earlier name
+    public static final int TOKEN_DIGITS = 7; // 32-bit little-endian unsigned number
+    public static final int TOKEN_DELTA = 8; // one-byte increment over the referenced earlier name's number
+    public static final int TOKEN_DELTA0 = 9; // as DELTA, zero-padded to the referenced token's length
+    public static final int TOKEN_MATCH = 10; // same token as in the referenced earlier name
+    public static final int TOKEN_END = 12; // ends the token list of a name
+
+    private static final int TOTAL_TOKEN_TYPES = 13; // number of per-type stream lists (indices 0..12)
+    private static final int NEW_TOKEN_FLAG_MASK = 0x80; // descriptor bit: stream begins the next token position
+    private static final int DUP_TOKEN_FLAG_MASK = 0x40; // descriptor bit: stream duplicates an earlier stream
+    private static final int TYPE_TOKEN_FLAG_MASK = 0x3F; // low six descriptor bits: the token type
+
+    private final List<List<Token>> tokenStreams;
+
+    /**
+     * Parses every token stream that follows the header into per-type, per-position buffers.
+     *
+     * @param inputByteBuffer positioned at the first stream descriptor; read until exhausted
+     * @param useArith non-zero decodes stream data with the range coder, zero with rANS Nx16
+     * @param numNames number of names; sizes the TYPE stream implied when a position has none
+     * @throws CRAMException if a token type is out of range or the first stream starts no token
+     */
+    public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final int numNames) {
+        // Outer index: token type. Inner index: token position within a name.
+        tokenStreams = new ArrayList<>(TOTAL_TOKEN_TYPES);
+        for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
+            tokenStreams.add(new ArrayList<>());
+        }
+        int tokenPosition = -1;
+        while (inputByteBuffer.hasRemaining()) {
+            final int tokenTypeFlags = inputByteBuffer.get() & 0xFF;
+            final boolean isNewToken = ((tokenTypeFlags & NEW_TOKEN_FLAG_MASK) != 0);
+            final boolean isDupToken = ((tokenTypeFlags & DUP_TOKEN_FLAG_MASK) != 0);
+            final int tokenType = (tokenTypeFlags & TYPE_TOKEN_FLAG_MASK);
+            // tokenType indexes tokenStreams, so 13 itself must be rejected (it is masked, hence never negative)
+            if (tokenType >= TOTAL_TOKEN_TYPES) {
+                throw new CRAMException("Invalid Token tokenType: " + tokenType);
+            }
+            if (isNewToken) {
+                tokenPosition++;
+                // give every type without a stream at the previous position an empty one, keeping index == position
+                for (final List<Token> tokenStream : tokenStreams) {
+                    if (tokenStream.size() < tokenPosition) {
+                        tokenStream.add(new Token(ByteBuffer.allocate(0)));
+                    }
+                }
+            } else if (tokenPosition < 0) {
+                throw new CRAMException("The first token stream must start a new token position");
+            }
+            if (isNewToken && tokenType != TOKEN_TYPE) {
+                // Spec: if we have a byte stream B(5,DIGITS) but no B(5,TYPE) then we assume the contents of
+                // B(5,TYPE) consist of one DIGITS type followed by as many MATCH types as are needed.
+                final ByteBuffer typeDataByteBuffer = ByteBuffer.allocate(numNames);
+                for (int i = 0; i < numNames; i++) {
+                    typeDataByteBuffer.put((byte) TOKEN_MATCH);
+                }
+                typeDataByteBuffer.rewind();
+                typeDataByteBuffer.put(0, (byte) tokenType);
+                tokenStreams.get(0).add(new Token(typeDataByteBuffer));
+            }
+            if (isDupToken) {
+                // duplicate() shares the referenced stream's bytes but has an independent read position
+                final int dupPosition = inputByteBuffer.get() & 0xFF;
+                final int dupType = inputByteBuffer.get() & 0xFF;
+                final Token dupTokenStream = new Token(tokenStreams.get(dupType).get(dupPosition).getByteBuffer().duplicate());
+                tokenStreams.get(tokenType).add(tokenPosition, dupTokenStream);
+            } else {
+                final int clen = Utils.readUint7(inputByteBuffer);
+                final byte[] dataBytes = new byte[clen];
+                inputByteBuffer.get(dataBytes, 0, clen); // offset in the dst byte array
+                final ByteBuffer uncompressedDataByteBuffer;
+                if (useArith != 0) {
+                    final RangeDecode rangeDecode = new RangeDecode();
+                    uncompressedDataByteBuffer = rangeDecode.uncompress(ByteBuffer.wrap(dataBytes));
+                } else {
+                    final RANSDecode ransdecode = new RANSNx16Decode();
+                    uncompressedDataByteBuffer = ransdecode.uncompress(ByteBuffer.wrap(dataBytes));
+                }
+                tokenStreams.get(tokenType).add(tokenPosition, new Token(uncompressedDataByteBuffer));
+            }
+        }
+    }
+    /** Returns the stream buffer stored for {@code type} at token {@code position}; the buffer is shared, not copied. */
+    public ByteBuffer getTokenStreamBuffer(final int position, final int type) {
+        return tokenStreams.get(type).get(position).getByteBuffer();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/Token.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/Token.java
new file mode 100644
index 0000000000..dbff2195c3
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/Token.java
@@ -0,0 +1,17 @@
+package htsjdk.samtools.cram.compression.nametokenisation.tokens;
+
+import java.nio.ByteBuffer;
+
+/** Holds the buffer of a single token stream; the buffer is stored and handed out without copying. */
+public class Token {
+    private final ByteBuffer streamBuffer;
+
+    public Token(final ByteBuffer inputByteBuffer) {
+        this.streamBuffer = inputByteBuffer;
+    }
+
+    /** Returns the same buffer instance that was passed to the constructor. */
+    public ByteBuffer getByteBuffer() {
+        return this.streamBuffer;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/utils.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/utils.java
new file mode 100644
index 0000000000..727c0c96c2
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/utils.java
@@ -0,0 +1,17 @@
+package htsjdk.samtools.cram.compression.nametokenisation;
+
+import java.nio.ByteBuffer;
+
+public class utils {
+    /** Reads an unsigned integer stored 7 bits per byte, high-order group first; a set top bit means another byte follows. */
+    public static int readUint7(ByteBuffer cp) {
+        int value = 0;
+        while (true) {
+            final int next = cp.get();
+            value = (value << 7) | (next & 0x7f);
+            if ((next & 0x80) == 0) {
+                return value;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java
new file mode 100644
index 0000000000..287510aabd
--- /dev/null
+++ b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java
@@ -0,0 +1,87 @@
+package htsjdk.samtools.cram;
+
+import htsjdk.HtsjdkTest;
+import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationDecode;
+import org.apache.commons.compress.utils.IOUtils;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+public class NameTokenizationInteropTest extends HtsjdkTest {
+    public static final String COMPRESSED_TOK_DIR = "tok3"; // sub-directory of names/ that is scanned for compressed files
+
+    // One test case per compressed interop file, with params:
+    // compressed testfile path, uncompressed testfile path, NameTokenization decoder.
+    // TestNG resolves this provider through its name attribute, not through the method name.
+    @DataProvider(name = "allNameTokenizationFiles")
+    public Object[][] getAllRansCodecsForRoundTrip() throws IOException {
+        final List<Path> compressedFiles = getInteropNameTokenizationCompressedFiles();
+        final Object[][] testCases = new Object[compressedFiles.size()][];
+        for (int i = 0; i < testCases.length; i++) {
+            final Path compressedFile = compressedFiles.get(i);
+            testCases[i] = new Object[]{
+                    compressedFile,
+                    getNameTokenizationUnCompressedFilePath(compressedFile),
+                    new NameTokenisationDecode()
+            };
+        }
+        return testCases;
+    }
+
+    @Test(
+            dataProvider = "allNameTokenizationFiles",
+            description = "Uncompress the existing compressed file using htsjdk NameTokenization " +
+                    "and compare it with the original file.")
+    public void testNameTokenizationDecoder(
+            final Path compressedFilePath,
+            final Path uncompressedFilePath,
+            final NameTokenisationDecode nameTokenisationDecode) throws IOException {
+        // Files.readAllBytes opens, reads and closes each file, so no stream is left open
+        final ByteBuffer preCompressedInteropBytes = ByteBuffer.wrap(Files.readAllBytes(compressedFilePath));
+        final ByteBuffer unCompressedInteropBytes = ByteBuffer.wrap(Files.readAllBytes(uncompressedFilePath));
+        // uncompress is static, so it is called on the class rather than through the supplied instance
+        final String decompressedNames = NameTokenisationDecode.uncompress(preCompressedInteropBytes);
+        final ByteBuffer decompressedNamesBuffer = StandardCharsets.UTF_8.encode(decompressedNames);
+        Assert.assertEquals(decompressedNamesBuffer, unCompressedInteropBytes);
+    }
+
+    // return a list of all NameTokenization encoded test data files in the htscodecs/tests/names/tok3 directory; the directory stream is closed once listed
+    private List<Path> getInteropNameTokenizationCompressedFiles() throws IOException {
+        final List<Path> paths = new ArrayList<>();
+        final Path tok3Dir = CRAMInteropTestUtils.getInteropTestDataLocation().resolve("names/" + COMPRESSED_TOK_DIR);
+        try (java.nio.file.DirectoryStream<Path> tok3Files = Files.newDirectoryStream(tok3Dir, path -> Files.isRegularFile(path))) {
+            tok3Files.forEach(paths::add);
+        }
+        return paths;
+    }
+
+    // Maps a compressed test file to its uncompressed original, e.g. ../names/tok3/01.names.1 => ../names/01.names
+    public static final Path getNameTokenizationUnCompressedFilePath(final Path compressedInteropPath) {
+        final String compressedFileName = compressedInteropPath.getFileName().toString();
+        final String originalFileName = getUncompressedFileName(compressedFileName);
+        return compressedInteropPath.getParent().getParent().resolve(originalFileName);
+    }
+
+    /**
+     * Returns the original file name: {@code compressedFileName} up to, but excluding, its last '.'.
+     */
+    public static final String getUncompressedFileName(final String compressedFileName) {
+        final int lastDotIndex = compressedFileName.lastIndexOf('.');
+        if (lastDotIndex < 0) {
+            throw new CRAMException("The format of the compressed File Name is not as expected. " +
+                    "The name of the compressed file should contain a period followed by a number that " +
+                    "indicates type of compression. Actual compressed file name = " + compressedFileName);
+        }
+        return compressedFileName.substring(0, lastDotIndex);
+    }
+
+}
\ No newline at end of file