From fcdebb7123f87e03ba63973abe4533e13ba7c827 Mon Sep 17 00:00:00 2001
From: yash-puligundla
Date: Thu, 2 Mar 2023 16:11:16 -0500
Subject: [PATCH] Add NameTokenization Decoder

---
 .../NameTokenisationDecode.java               | 238 ++++++++++++++++++
 .../nametokenisation/TokenStreams.java        | 136 ++++++++++
 .../nametokenisation/tokens/Token.java        |  26 ++
 .../compression/nametokenisation/utils.java   |  27 ++
 .../cram/NameTokenizationInteropTest.java     |  97 +++++++
 5 files changed, 524 insertions(+)
 create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
 create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
 create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/Token.java
 create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/utils.java
 create mode 100644 src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java

diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
new file mode 100644
index 0000000000..54479f1bd5
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
@@ -0,0 +1,238 @@
+package htsjdk.samtools.cram.compression.nametokenisation;
+
+import htsjdk.samtools.cram.CRAMException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.List;
+
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_TYPE;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_STRING;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_CHAR;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS0;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DZLEN;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DUP;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA0;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_MATCH;
+import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_END;
+
+/**
+ * Decoder for read names compressed with the CRAM name tokenisation codec.
+ *
+ * Every name is rebuilt token by token from the byte streams held by {@link TokenStreams},
+ * either from literal values or relative to the tokens of a previously decoded name.
+ */
+public class NameTokenisationDecode {
+
+    /**
+     * Decodes every name in {@code inBuffer}, terminating each one with a newline.
+     *
+     * @param inBuffer compressed name block; its byte order is switched to little-endian and its
+     *                 position is advanced as the data is consumed
+     * @return all decoded names, each followed by {@code "\n"}
+     */
+    public static String uncompress(final ByteBuffer inBuffer) {
+        return uncompress(inBuffer, "\n");
+    }
+
+    /**
+     * Decodes every name in {@code inBuffer}, terminating each one with {@code separator}.
+     *
+     * @param inBuffer  compressed name block; its byte order is switched to little-endian and its
+     *                  position is advanced as the data is consumed
+     * @param separator text appended after every decoded name, including the last one
+     * @return all decoded names, each followed by {@code separator}
+     * @throws CRAMException if the header declares a negative number of names or a token stream
+     *                       is rejected as malformed
+     */
+    public static String uncompress(final ByteBuffer inBuffer, final String separator) {
+        inBuffer.order(ByteOrder.LITTLE_ENDIAN);
+        // The header starts with the uncompressed length, which decoding does not need: skip it.
+        inBuffer.getInt();
+        // An int masked with 0xFFFFFFFF is unchanged, so a count above Integer.MAX_VALUE shows up
+        // as a negative value; reject it explicitly.
+        final int numNames = inBuffer.getInt();
+        if (numNames < 0) {
+            throw new CRAMException("Invalid number of names: " + numNames);
+        }
+        final int useArith = inBuffer.get() & 0xFF;
+        final TokenStreams tokenStreams = new TokenStreams(inBuffer, useArith, numNames);
+        final List<List<String>> tokensList = new ArrayList<>(numNames);
+        for (int i = 0; i < numNames; i++) {
+            tokensList.add(new ArrayList<>());
+        }
+        // Appending to a StringBuilder avoids re-copying the whole result for every name.
+        final StringBuilder decodedNames = new StringBuilder();
+        for (int i = 0; i < numNames; i++) {
+            decodedNames.append(decodeSingleName(tokenStreams, tokensList, i)).append(separator);
+        }
+        return decodedNames.toString();
+    }
+
+    /**
+     * Decodes the name at {@code currentNameIndex} and records its tokens in {@code tokensList}.
+     *
+     * @param tokenStreams     token byte streams, addressed by token position and token type
+     * @param tokensList       tokens of every name; the entry at {@code currentNameIndex} is
+     *                         filled in by this call
+     * @param currentNameIndex index of the name to decode
+     * @return the decoded name
+     * @throws CRAMException if the distance to the reference name points before the first name
+     *                       or past the current one
+     */
+    private static String decodeSingleName(final TokenStreams tokenStreams,
+                                           final List<List<String>> tokensList,
+                                           final int currentNameIndex) {
+        // Position 0 carries the name-level token type; the stream of that type at position 0
+        // carries the distance back to the name this one is described against.
+        int type = tokenStreams.getTokenStreamBuffer(0, TOKEN_TYPE).get() & 0xFF;
+        final ByteBuffer distBuffer = tokenStreams.getTokenStreamBuffer(0, type).order(ByteOrder.LITTLE_ENDIAN);
+        final int dist = distBuffer.getInt();
+        final int prevNameIndex = currentNameIndex - dist;
+        if (prevNameIndex < 0 || prevNameIndex > currentNameIndex) {
+            throw new CRAMException(String.format(
+                    "Invalid distance %d to the reference name while decoding name %d", dist, currentNameIndex));
+        }
+        if (type == TOKEN_DUP) {
+            // Replace the placeholder instead of inserting, so the list keeps one entry per name.
+            tokensList.set(currentNameIndex, tokensList.get(prevNameIndex));
+            return String.join("", tokensList.get(currentNameIndex));
+        }
+        final List<String> currentTokens = tokensList.get(currentNameIndex);
+        int tokenPosition = 1;
+        do {
+            type = tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_TYPE).get() & 0xFF;
+            String currentToken = "";
+            switch (type) {
+                case TOKEN_CHAR:
+                    // Mask the byte so values above 0x7F are not sign-extended into the char.
+                    currentToken = String.valueOf(
+                            (char) (tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_CHAR).get() & 0xFF));
+                    break;
+                case TOKEN_STRING:
+                    currentToken = readString(tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_STRING));
+                    break;
+                case TOKEN_DIGITS:
+                    currentToken = getDigitsToken(tokenStreams, tokenPosition, TOKEN_DIGITS);
+                    break;
+                case TOKEN_DIGITS0:
+                    final String digits0Token = getDigitsToken(tokenStreams, tokenPosition, TOKEN_DIGITS0);
+                    final int lenDigits0Token = tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_DZLEN).get() & 0xFF;
+                    currentToken = leftPadNumber(digits0Token, lenDigits0Token);
+                    break;
+                case TOKEN_DELTA:
+                    currentToken = getDeltaToken(tokenStreams, tokenPosition, tokensList, prevNameIndex, TOKEN_DELTA);
+                    break;
+                case TOKEN_DELTA0:
+                    final String delta0Token = getDeltaToken(tokenStreams, tokenPosition, tokensList, prevNameIndex, TOKEN_DELTA0);
+                    final int lenDelta0Token = tokensList.get(prevNameIndex).get(tokenPosition - 1).length();
+                    currentToken = leftPadNumber(delta0Token, lenDelta0Token);
+                    break;
+                case TOKEN_MATCH:
+                    currentToken = tokensList.get(prevNameIndex).get(tokenPosition - 1);
+                    break;
+                default:
+                    // TOKEN_END and every other type contribute an empty token.
+                    break;
+            }
+            currentTokens.add(currentToken);
+            tokenPosition++;
+        } while (type != TOKEN_END);
+        return String.join("", currentTokens);
+    }
+
+    /**
+     * Decodes a DELTA or DELTA0 token: the numeric token at the same position in the reference
+     * name plus an unsigned one-byte delta read from the stream.
+     *
+     * @param tokenStreams  token byte streams, addressed by token position and token type
+     * @param tokenPosition position of the token within the name, starting at 1
+     * @param tokensList    tokens of the names decoded so far
+     * @param prevNameIndex index of the reference name
+     * @param tokenType     {@code TOKEN_DELTA} or {@code TOKEN_DELTA0}
+     * @return the sum rendered in decimal, without padding
+     * @throws CRAMException if {@code tokenType} is not a delta type or the reference token is not
+     *                       a number
+     */
+    private static String getDeltaToken(
+            final TokenStreams tokenStreams,
+            final int tokenPosition,
+            final List<List<String>> tokensList,
+            final int prevNameIndex,
+            final int tokenType) {
+        if (!(tokenType == TOKEN_DELTA || tokenType == TOKEN_DELTA0)) {
+            throw new CRAMException(String.format("Invalid tokenType : %s. tokenType must be either TOKEN_DELTA or TOKEN_DELTA0", tokenType));
+        }
+        // Digits tokens are read as unsigned 32-bit values, so parse and add as long to avoid int overflow.
+        final long prevToken;
+        try {
+            prevToken = Long.parseLong(tokensList.get(prevNameIndex).get(tokenPosition - 1));
+        } catch (final NumberFormatException e) {
+            final String exceptionMessageSubstring = (tokenType == TOKEN_DELTA) ? "DIGITS or DELTA" : "DIGITS0 or DELTA0";
+            throw new CRAMException(String.format("The token in the prior name must be of type %s", exceptionMessageSubstring), e);
+        }
+        final int deltaTokenValue = tokenStreams.getTokenStreamBuffer(tokenPosition, tokenType).get() & 0xFF;
+        return Long.toString(prevToken + deltaTokenValue);
+    }
+
+    /**
+     * Reads a DIGITS or DIGITS0 token: a little-endian unsigned 32-bit integer.
+     *
+     * @param tokenStreams  token byte streams, addressed by token position and token type
+     * @param tokenPosition position of the token within the name, starting at 1
+     * @param tokenType     {@code TOKEN_DIGITS} or {@code TOKEN_DIGITS0}
+     * @return the value rendered in decimal, without padding
+     * @throws CRAMException if {@code tokenType} is not a digits type
+     */
+    private static String getDigitsToken(final TokenStreams tokenStreams, final int tokenPosition, final int tokenType) {
+        if (!(tokenType == TOKEN_DIGITS || tokenType == TOKEN_DIGITS0)) {
+            throw new CRAMException(String.format("Invalid tokenType : %s. tokenType must be either TOKEN_DIGITS or TOKEN_DIGITS0", tokenType));
+        }
+        final ByteBuffer digitsByteBuffer = tokenStreams.getTokenStreamBuffer(tokenPosition, tokenType);
+        digitsByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+        final long digits = digitsByteBuffer.getInt() & 0xFFFFFFFFL;
+        return Long.toString(digits);
+    }
+
+    /**
+     * Reads a zero-terminated string, mapping each byte to the char with the same unsigned value.
+     *
+     * @param inputBuffer buffer positioned at the first byte of the string; left positioned just
+     *                    after the terminating zero byte
+     * @return the bytes before the terminator as a string
+     */
+    private static String readString(final ByteBuffer inputBuffer) {
+        // spec: We fetch one byte at a time from the value byte stream,
+        // appending to the name buffer until the byte retrieved is zero.
+        final StringBuilder sb = new StringBuilder();
+        byte b = inputBuffer.get();
+        while (b != 0) {
+            // Mask the byte so values above 0x7F are not sign-extended into the char.
+            sb.append((char) (b & 0xFF));
+            b = inputBuffer.get();
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Left-pads {@code value} with zeros until it is at least {@code len} characters long.
+     *
+     * @param value decimal text to pad
+     * @param len   minimum length of the result
+     * @return {@code value} unchanged if it is already long enough, otherwise the padded text
+     */
+    private static String leftPadNumber(final String value, final int len) {
+        final int missing = len - value.length();
+        if (missing <= 0) {
+            return value;
+        }
+        final StringBuilder padded = new StringBuilder(len);
+        for (int i = 0; i < missing; i++) {
+            padded.append('0');
+        }
+        return padded.append(value).toString();
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
new file mode 100644
index 0000000000..51e497fb26
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
@@ -0,0 +1,136 @@
+package htsjdk.samtools.cram.compression.nametokenisation;
+
+import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.compression.nametokenisation.tokens.Token;
+import htsjdk.samtools.cram.compression.range.RangeDecode;
+import htsjdk.samtools.cram.compression.rans.RANSDecode;
+import htsjdk.samtools.cram.compression.rans.Utils;
+import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Token byte streams of a name tokenisation block, decompressed and addressable by token
+ * position and token type.
+ */
+public class TokenStreams {
+
+    public static final int TOKEN_TYPE = 0;
+    public static final int TOKEN_STRING = 1;
+    public static final int TOKEN_CHAR = 2;
+    public static final int TOKEN_DIGITS0 = 3;
+    public static final int TOKEN_DZLEN = 4;
+    public static final int TOKEN_DUP = 5;
+    public static final int TOKEN_DIGITS = 7;
+    public static final int TOKEN_DELTA = 8;
+    public static final int TOKEN_DELTA0 = 9;
+    public static final int TOKEN_MATCH = 10;
+    public static final int TOKEN_END = 12;
+
+    private static final int TOTAL_TOKEN_TYPES = 13;
+    private static final int NEW_TOKEN_FLAG_MASK = 0x80;
+    private static final int DUP_TOKEN_FLAG_MASK = 0x40;
+    private static final int TYPE_TOKEN_FLAG_MASK = 0x3F;
+
+    // Outer index: token type. Inner index: token position.
+    private final List<List<Token>> tokenStreams;
+
+    /**
+     * Reads and decompresses every token byte stream remaining in {@code inputByteBuffer}.
+     *
+     * @param inputByteBuffer buffer positioned at the first token stream; consumed to its limit
+     * @param useArith        non-zero to decompress streams with {@link RangeDecode},
+     *                        zero to use {@link RANSNx16Decode}
+     * @param numNames        number of names in the block; sizes the TYPE stream synthesised for
+     *                        positions whose first stream is not a TYPE stream
+     * @throws CRAMException if a token type, a duplicate reference or a stream length is invalid
+     */
+    public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final int numNames) {
+        tokenStreams = new ArrayList<>(TOTAL_TOKEN_TYPES);
+        for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
+            tokenStreams.add(new ArrayList<>());
+        }
+        int tokenPosition = -1;
+        while (inputByteBuffer.hasRemaining()) {
+            final int tokenTypeFlags = inputByteBuffer.get() & 0xFF;
+            final boolean isNewToken = ((tokenTypeFlags & NEW_TOKEN_FLAG_MASK) != 0);
+            final boolean isDupToken = ((tokenTypeFlags & DUP_TOKEN_FLAG_MASK) != 0);
+            final int tokenType = (tokenTypeFlags & TYPE_TOKEN_FLAG_MASK);
+            // Valid types are 0..TOTAL_TOKEN_TYPES-1; anything else would index past the stream lists.
+            if (tokenType >= TOTAL_TOKEN_TYPES) {
+                throw new CRAMException("Invalid Token tokenType: " + tokenType);
+            }
+            if (isNewToken) {
+                tokenPosition++;
+                if (tokenPosition > 0) {
+                    // Pad every type that stored no stream at the previous position with an empty
+                    // one, so that list indices keep matching token positions.
+                    for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
+                        final List<Token> tokenStream = tokenStreams.get(i);
+                        if (tokenStream.size() < tokenPosition) {
+                            tokenStream.add(new Token(ByteBuffer.allocate(0)));
+                        }
+                        if (tokenStream.size() < tokenPosition) {
+                            throw new CRAMException("TokenStream is missing Token(s) at Token Type: " + i);
+                        }
+                    }
+                }
+            }
+            if (tokenPosition < 0) {
+                throw new CRAMException("The first token stream does not start a new token position");
+            }
+
+            if (isNewToken && tokenType != TOKEN_TYPE) {
+                // Spec: if we have a byte stream B5,DIGITS but no B5,TYPE then we assume the contents
+                // of B5,TYPE consist of one DIGITS tokenType followed by as many MATCH types as are needed.
+                final byte[] typeData = new byte[numNames];
+                Arrays.fill(typeData, (byte) TOKEN_MATCH);
+                if (numNames > 0) {
+                    typeData[0] = (byte) tokenType;
+                }
+                tokenStreams.get(TOKEN_TYPE).add(new Token(ByteBuffer.wrap(typeData)));
+            }
+            if (isDupToken) {
+                final int dupPosition = inputByteBuffer.get() & 0xFF;
+                final int dupType = inputByteBuffer.get() & 0xFF;
+                if (dupType >= TOTAL_TOKEN_TYPES || dupPosition >= tokenStreams.get(dupType).size()) {
+                    throw new CRAMException(String.format(
+                            "Duplicate token stream refers to a missing stream: position %d, type %d", dupPosition, dupType));
+                }
+                final Token dupTokenStream = new Token(tokenStreams.get(dupType).get(dupPosition).getByteBuffer().duplicate());
+                tokenStreams.get(tokenType).add(tokenPosition, dupTokenStream);
+            } else {
+                final int clen = Utils.readUint7(inputByteBuffer);
+                if (clen < 0 || clen > inputByteBuffer.remaining()) {
+                    throw new CRAMException("Invalid compressed token stream length: " + clen);
+                }
+                final byte[] dataBytes = new byte[clen];
+                inputByteBuffer.get(dataBytes, 0, clen); // offset in the dst byte array
+                final ByteBuffer uncompressedDataByteBuffer;
+                if (useArith != 0) {
+                    final RangeDecode rangeDecode = new RangeDecode();
+                    uncompressedDataByteBuffer = rangeDecode.uncompress(ByteBuffer.wrap(dataBytes));
+                } else {
+                    final RANSDecode ransdecode = new RANSNx16Decode();
+                    uncompressedDataByteBuffer = ransdecode.uncompress(ByteBuffer.wrap(dataBytes));
+                }
+                tokenStreams.get(tokenType).add(tokenPosition, new Token(uncompressedDataByteBuffer));
+            }
+        }
+    }
+
+    /**
+     * Returns the byte stream stored for a token position and token type.
+     *
+     * @param position token position
+     * @param type     token type, one of the {@code TOKEN_*} constants
+     * @return the stream's buffer; reading from it advances the stored stream itself
+     * @throws IndexOutOfBoundsException if no stream is stored for {@code position} and {@code type}
+     */
+    public ByteBuffer getTokenStreamBuffer(final int position, final int type) {
+        return tokenStreams.get(type).get(position).getByteBuffer();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/Token.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/Token.java
new file mode 100644
index 0000000000..dbff2195c3
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/Token.java
@@ -0,0 +1,26 @@
+package htsjdk.samtools.cram.compression.nametokenisation.tokens;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Holder for the byte stream of one token type at one token position.
+ */
+public class Token {
+
+    private final ByteBuffer byteBuffer;
+
+    /**
+     * @param inputByteBuffer buffer to hold; it is stored as is, not copied
+     */
+    public Token(final ByteBuffer inputByteBuffer) {
+        byteBuffer = inputByteBuffer;
+    }
+
+    /**
+     * @return the buffer passed to the constructor; reads through it advance its position
+     */
+    public ByteBuffer getByteBuffer() {
+        return byteBuffer;
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/utils.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/utils.java
new file mode 100644
index 0000000000..727c0c96c2
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/utils.java
@@ -0,0 +1,27 @@
+package htsjdk.samtools.cram.compression.nametokenisation;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Helpers for reading name tokenisation data.
+ */
+public class utils {
+
+    /**
+     * Reads a variable-length unsigned integer stored 7 bits per byte, most significant group
+     * first; the high bit of each byte is set when another byte follows.
+     *
+     * @param cp buffer to read from; its position is advanced past the bytes consumed
+     * @return the decoded value
+     */
+    public static int readUint7(ByteBuffer cp) {
+        int i = 0;
+        int c;
+        do {
+            //read byte
+            c = cp.get();
+            i = (i << 7) | (c & 0x7f);
+        } while ((c & 0x80) != 0);
+        return i;
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java
new file mode 100644
index 0000000000..287510aabd
--- /dev/null
+++ b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java
@@ -0,0 +1,97 @@
+package htsjdk.samtools.cram;
+
+import htsjdk.HtsjdkTest;
+import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationDecode;
+import org.apache.commons.compress.utils.IOUtils;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Decodes the name tokenisation interop test files and compares the result with the original names.
+ */
+public class NameTokenizationInteropTest extends HtsjdkTest {
+    public static final String COMPRESSED_TOK_DIR = "tok3";
+
+    @DataProvider(name = "allNameTokenizationFiles")
+    public Object[][] getAllRansCodecsForRoundTrip() throws IOException {
+
+        // params:
+        // compressed testfile path, uncompressed testfile path, NameTokenization decoder,
+        final List<Object[]> testCases = new ArrayList<>();
+        for (Path path : getInteropNameTokenizationCompressedFiles()) {
+            Object[] objects = new Object[]{
+                    path,
+                    getNameTokenizationUnCompressedFilePath(path),
+                    new NameTokenisationDecode()
+            };
+            testCases.add(objects);
+        }
+        return testCases.toArray(new Object[][]{});
+
+    }
+
+    @Test(
+            dataProvider = "allNameTokenizationFiles",
+            description = "Uncompress the existing compressed file using htsjdk NameTokenization " +
+                    "and compare it with the original file.")
+    public void testNameTokenizationDecoder(
+            final Path compressedFilePath,
+            final Path uncompressedFilePath,
+            final NameTokenisationDecode nameTokenisationDecode) throws IOException {
+        // try-with-resources closes both streams even when a read or the assertion fails.
+        try (InputStream preCompressedInteropStream = Files.newInputStream(compressedFilePath);
+             InputStream unCompressedInteropStream = Files.newInputStream(uncompressedFilePath)) {
+            final ByteBuffer preCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(preCompressedInteropStream));
+            final ByteBuffer unCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream));
+            // uncompress is static; the decoder instance from the data provider is not needed to call it.
+            final String decompressedNames = NameTokenisationDecode.uncompress(preCompressedInteropBytes);
+            // The decoder maps each byte to the char with the same value, so ISO-8859-1 restores the bytes.
+            final ByteBuffer decompressedNamesBuffer = StandardCharsets.ISO_8859_1.encode(decompressedNames);
+            Assert.assertEquals(decompressedNamesBuffer, unCompressedInteropBytes);
+        }
+    }
+
+    // return a list of all NameTokenization encoded test data files in the htscodecs/tests/names/tok3 directory
+    private List<Path> getInteropNameTokenizationCompressedFiles() throws IOException {
+        final List<Path> paths = new ArrayList<>();
+        // The directory stream holds an open directory handle, so close it once the listing is read.
+        try (DirectoryStream<Path> directoryStream = Files.newDirectoryStream(
+                CRAMInteropTestUtils.getInteropTestDataLocation().resolve("names/" + COMPRESSED_TOK_DIR),
+                path -> Files.isRegularFile(path))) {
+            directoryStream.forEach(paths::add);
+        }
+        return paths;
+    }
+
+    // Given a compressed test file path, return the corresponding uncompressed file path
+    public static final Path getNameTokenizationUnCompressedFilePath(final Path compressedInteropPath) {
+        String uncompressedFileName = getUncompressedFileName(compressedInteropPath.getFileName().toString());
+        // Example compressedInteropPath: ../names/tok3/01.names.1 => unCompressedFilePath: ../names/01.names
+        return compressedInteropPath.getParent().getParent().resolve(uncompressedFileName);
+    }
+
+    public static final String getUncompressedFileName(final String compressedFileName) {
+        // Returns original filename from compressed file name
+        int lastDotIndex = compressedFileName.lastIndexOf(".");
+        if (lastDotIndex >= 0) {
+            String fileName = compressedFileName.substring(0, lastDotIndex);
+            return fileName;
+        } else {
+            throw new CRAMException("The format of the compressed File Name is not as expected. " +
+                    "The name of the compressed file should contain a period followed by a number that " +
+                    "indicates type of compression. Actual compressed file name = "+ compressedFileName);
+        }
+    }
+
+}
\ No newline at end of file