Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NameTokenization Decoder #4

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
package htsjdk.samtools.cram.compression.nametokenisation;

import htsjdk.samtools.cram.CRAMException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.List;

import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_TYPE;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_STRING;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_CHAR;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS0;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DZLEN;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DUP;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA0;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_MATCH;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_END;

public class NameTokenisationDecode {

    /**
     * Uncompress a name-tokenisation encoded stream, joining the decoded names
     * with the default newline separator.
     *
     * @param inBuffer buffer positioned at the start of the compressed data
     * @return all decoded names, each followed by a newline
     */
    public static String uncompress(final ByteBuffer inBuffer) {
        return uncompress(inBuffer, "\n");
    }

    /**
     * Uncompress a name-tokenisation encoded stream.
     *
     * @param inBuffer  buffer positioned at the start of the compressed data
     * @param separator string appended after every decoded name
     * @return all decoded names, each followed by {@code separator}
     */
    public static String uncompress(final ByteBuffer inBuffer, final String separator) {
        inBuffer.order(ByteOrder.LITTLE_ENDIAN);
        // The header stores the uncompressed length, which is not needed for decoding;
        // read it anyway to advance the buffer past it, per the spec.
        inBuffer.getInt(); // uncompressedLength (unused)
        final int numNames = inBuffer.getInt();
        final int useArith = inBuffer.get() & 0xFF;
        final TokenStreams tokenStreams = new TokenStreams(inBuffer, useArith, numNames);
        // One token list per name; back-references (DUP/MATCH/DELTA) index into this.
        final List<List<String>> tokensList = new ArrayList<>(numNames);
        for (int i = 0; i < numNames; i++) {
            tokensList.add(new ArrayList<>());
        }
        // StringBuilder avoids O(n^2) repeated String concatenation over many names.
        final StringBuilder decodedNames = new StringBuilder();
        for (int i = 0; i < numNames; i++) {
            decodedNames.append(decodeSingleName(tokenStreams, tokensList, i));
            decodedNames.append(separator);
        }
        return decodedNames.toString();
    }

    /**
     * Decode one name, appending its tokens to {@code tokensList} at
     * {@code currentNameIndex} so later names can back-reference them.
     */
    private static String decodeSingleName(final TokenStreams tokenStreams,
                                           final List<List<String>> tokensList,
                                           final int currentNameIndex) {
        // Stream position 0 holds per-name metadata: the first type byte and the
        // distance back to the name this one references.
        int type = tokenStreams.getTokenStreamBuffer(0, TOKEN_TYPE).get() & 0xFF;
        final ByteBuffer distBuffer = tokenStreams.getTokenStreamBuffer(0, type).order(ByteOrder.LITTLE_ENDIAN);
        final int dist = distBuffer.getInt();
        final int prevNameIndex = currentNameIndex - dist;
        if (type == TOKEN_DUP) {
            // Exact duplicate of an earlier name: share its token list.
            // Use set(), not add(index, ...), so the list is replaced in place
            // rather than inserted (which would grow the list past numNames).
            tokensList.set(currentNameIndex, tokensList.get(prevNameIndex));
            return String.join("", tokensList.get(currentNameIndex));
        }
        int tokenPosition = 1;
        do {
            type = tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_TYPE).get() & 0xFF;
            String currentToken = "";
            switch (type) {
                case TOKEN_CHAR:
                    final char currentTokenChar = (char) tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_CHAR).get();
                    currentToken = String.valueOf(currentTokenChar);
                    break;
                case TOKEN_STRING:
                    currentToken = readString(tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_STRING));
                    break;
                case TOKEN_DIGITS:
                    currentToken = getDigitsToken(tokenStreams, tokenPosition, TOKEN_DIGITS);
                    break;
                case TOKEN_DIGITS0:
                    // DIGITS0 is a zero-padded number: the digits plus an explicit width.
                    final String digits0Token = getDigitsToken(tokenStreams, tokenPosition, TOKEN_DIGITS0);
                    final int lenDigits0Token = tokenStreams.getTokenStreamBuffer(tokenPosition, TOKEN_DZLEN).get() & 0xFF;
                    currentToken = leftPadNumber(digits0Token, lenDigits0Token);
                    break;
                case TOKEN_DELTA:
                    currentToken = getDeltaToken(tokenStreams, tokenPosition, tokensList, prevNameIndex, TOKEN_DELTA);
                    break;
                case TOKEN_DELTA0:
                    // DELTA0 keeps the zero-padded width of the referenced token.
                    final String delta0Token = getDeltaToken(tokenStreams, tokenPosition, tokensList, prevNameIndex, TOKEN_DELTA0);
                    final int lenDelta0Token = tokensList.get(prevNameIndex).get(tokenPosition - 1).length();
                    currentToken = leftPadNumber(delta0Token, lenDelta0Token);
                    break;
                case TOKEN_MATCH:
                    currentToken = tokensList.get(prevNameIndex).get(tokenPosition - 1);
                    break;
                default:
                    // TOKEN_END (and any other type) contributes an empty token.
                    break;
            }
            // Token positions are processed in order, so this is a plain append.
            tokensList.get(currentNameIndex).add(currentToken);
            tokenPosition++;
        } while (type != TOKEN_END);
        return String.join("", tokensList.get(currentNameIndex));
    }

    /**
     * Decode a DELTA/DELTA0 token: the referenced name's numeric token at the same
     * position plus a one-byte unsigned delta.
     *
     * @throws CRAMException if {@code tokenType} is not DELTA/DELTA0, or the
     *                       referenced token is not numeric
     */
    private static String getDeltaToken(
            final TokenStreams tokenStreams,
            final int tokenPosition,
            final List<List<String>> tokensList,
            final int prevNameIndex,
            final int tokenType) {
        if (!(tokenType == TOKEN_DELTA || tokenType == TOKEN_DELTA0)) {
            throw new CRAMException(String.format("Invalid tokenType : %s. tokenType must be either TOKEN_DELTA or TOKEN_DELTA0", tokenType));
        }
        final int prevToken;
        try {
            prevToken = Integer.parseInt(tokensList.get(prevNameIndex).get(tokenPosition - 1));
        } catch (final NumberFormatException e) {
            final String exceptionMessageSubstring = (tokenType == TOKEN_DELTA) ? "DIGITS or DELTA" : "DIGITS0 or DELTA0";
            throw new CRAMException(String.format("The token in the prior name must be of type %s", exceptionMessageSubstring), e);
        }
        final int deltaTokenValue = tokenStreams.getTokenStreamBuffer(tokenPosition, tokenType).get() & 0xFF;
        return Long.toString(prevToken + deltaTokenValue);
    }

    /**
     * Decode a DIGITS/DIGITS0 token: a little-endian unsigned 32-bit value
     * rendered as decimal digits.
     *
     * @throws CRAMException if {@code tokenType} is not DIGITS/DIGITS0
     */
    private static String getDigitsToken(final TokenStreams tokenStreams, final int tokenPosition, final int tokenType) {
        if (!(tokenType == TOKEN_DIGITS || tokenType == TOKEN_DIGITS0)) {
            throw new CRAMException(String.format("Invalid tokenType : %s. tokenType must be either TOKEN_DIGITS or TOKEN_DIGITS0", tokenType));
        }
        final ByteBuffer digitsByteBuffer = tokenStreams.getTokenStreamBuffer(tokenPosition, tokenType);
        digitsByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
        // Widen to long so values >= 2^31 print as unsigned.
        final long digits = digitsByteBuffer.getInt() & 0xFFFFFFFFL;
        return Long.toString(digits);
    }

    /** Read a NUL-terminated string from the buffer, one byte per character. */
    private static String readString(final ByteBuffer inputBuffer) {
        // spec: We fetch one byte at a time from the value byte stream,
        // appending to the name buffer until the byte retrieved is zero.
        final StringBuilder sb = new StringBuilder();
        byte b = inputBuffer.get();
        while (b != 0) {
            sb.append((char) b);
            b = inputBuffer.get();
        }
        return sb.toString();
    }

    /** Left-pad {@code value} with '0' until it is at least {@code len} characters long. */
    private static String leftPadNumber(String value, final int len) {
        while (value.length() < len) {
            value = "0" + value;
        }
        return value;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package htsjdk.samtools.cram.compression.nametokenisation;

import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.compression.nametokenisation.tokens.Token;
import htsjdk.samtools.cram.compression.range.RangeDecode;
import htsjdk.samtools.cram.compression.rans.RANSDecode;
import htsjdk.samtools.cram.compression.rans.Utils;
import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

public class TokenStreams {

    public static final int TOKEN_TYPE = 0;
    public static final int TOKEN_STRING = 1;
    public static final int TOKEN_CHAR = 2;
    public static final int TOKEN_DIGITS0 = 3;
    public static final int TOKEN_DZLEN = 4;
    public static final int TOKEN_DUP = 5;
    public static final int TOKEN_DIGITS = 7;
    public static final int TOKEN_DELTA = 8;
    public static final int TOKEN_DELTA0 = 9;
    public static final int TOKEN_MATCH = 10;
    public static final int TOKEN_END = 12;

    private static final int TOTAL_TOKEN_TYPES = 13;
    private static final int NEW_TOKEN_FLAG_MASK = 0x80;
    private static final int DUP_TOKEN_FLAG_MASK = 0x40;
    private static final int TYPE_TOKEN_FLAG_MASK = 0x3F;

    // Outer index: token type (0..TOTAL_TOKEN_TYPES-1). Inner index: token position.
    private final List<List<Token>> tokenStreams;

    /**
     * Parse the serialized token streams from the input buffer, decompressing
     * each stream with either the range coder or rANS Nx16 as indicated.
     *
     * @param inputByteBuffer buffer positioned at the first token-stream header byte
     * @param useArith        non-zero to use the range (arithmetic) decoder, zero for rANS Nx16
     * @param numNames        number of names encoded; used to synthesize implicit TYPE streams
     * @throws CRAMException if a token type is out of range or a stream is missing
     */
    public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final int numNames) {
        // TokenStreams is a List of List of Tokens.
        // The outer index corresponds to the type of the token
        // and the inner index corresponds to the index of the current Name in the list of Names
        tokenStreams = new ArrayList<>(TOTAL_TOKEN_TYPES);
        for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
            tokenStreams.add(new ArrayList<>());
        }
        int tokenPosition = -1;
        while (inputByteBuffer.hasRemaining()) {
            final int tokenTypeFlags = inputByteBuffer.get() & 0xFF;
            final boolean isNewToken = ((tokenTypeFlags & NEW_TOKEN_FLAG_MASK) != 0);
            final boolean isDupToken = ((tokenTypeFlags & DUP_TOKEN_FLAG_MASK) != 0);
            final int tokenType = (tokenTypeFlags & TYPE_TOKEN_FLAG_MASK);
            // Valid types are 0..TOTAL_TOKEN_TYPES-1; anything else would index
            // past the end of tokenStreams. (The previous check of "> 13" wrongly
            // accepted type 13 and deferred the failure to an IndexOutOfBoundsException.)
            if (tokenType < 0 || tokenType >= TOTAL_TOKEN_TYPES) {
                throw new CRAMException("Invalid Token tokenType: " + tokenType);
            }
            if (isNewToken) {
                tokenPosition++;
                if (tokenPosition > 0) {
                    // If newToken and not the first newToken: pad every type's stream
                    // with at most one empty Token so all streams stay aligned by position.
                    for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
                        final List<Token> tokenStream = tokenStreams.get(i);
                        if (tokenStream.size() < tokenPosition) {
                            tokenStream.add(new Token(ByteBuffer.allocate(0)));
                        }
                        // Still short after padding one slot => input is inconsistent.
                        if (tokenStream.size() < tokenPosition) {
                            throw new CRAMException("TokenStream is missing Token(s) at Token Type: " + i);
                        }
                    }
                }
            }

            if ((isNewToken) && (tokenType != TOKEN_TYPE)) {

                // Spec: if we have a byte stream B5,DIGIT S but no B5,T Y P E
                // then we assume the contents of B5,T Y P E consist of one DIGITS tokenType
                // followed by as many MATCH types as are needed.
                final ByteBuffer typeDataByteBuffer = ByteBuffer.allocate(numNames);
                for (int i = 0; i < numNames; i++) {
                    typeDataByteBuffer.put((byte) TOKEN_MATCH);
                }
                typeDataByteBuffer.rewind();
                typeDataByteBuffer.put(0, (byte) tokenType);
                tokenStreams.get(0).add(new Token(typeDataByteBuffer));
            }
            if (isDupToken) {
                // The stream duplicates an earlier one; share its bytes via duplicate()
                // so each copy keeps an independent read position.
                final int dupPosition = inputByteBuffer.get() & 0xFF;
                final int dupType = inputByteBuffer.get() & 0xFF;
                final Token dupTokenStream = new Token(tokenStreams.get(dupType).get(dupPosition).getByteBuffer().duplicate());
                tokenStreams.get(tokenType).add(tokenPosition, dupTokenStream);
            } else {
                final int clen = Utils.readUint7(inputByteBuffer);
                final byte[] dataBytes = new byte[clen];
                inputByteBuffer.get(dataBytes, 0, clen); // offset in the dst byte array
                final ByteBuffer uncompressedDataByteBuffer;
                if (useArith != 0) {
                    final RangeDecode rangeDecode = new RangeDecode();
                    uncompressedDataByteBuffer = rangeDecode.uncompress(ByteBuffer.wrap(dataBytes));
                } else {
                    final RANSDecode ransdecode = new RANSNx16Decode();
                    uncompressedDataByteBuffer = ransdecode.uncompress(ByteBuffer.wrap(dataBytes));
                }
                tokenStreams.get(tokenType).add(tokenPosition, new Token(uncompressedDataByteBuffer));
            }
        }
    }

    /**
     * @param position token position within the stream
     * @param type     token type (one of the TOKEN_* constants)
     * @return the shared backing buffer of the requested token; reads advance its position
     */
    public ByteBuffer getTokenStreamBuffer(final int position, final int type) {
        return tokenStreams.get(type).get(position).getByteBuffer();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package htsjdk.samtools.cram.compression.nametokenisation.tokens;

import java.nio.ByteBuffer;

/**
 * A single token stream's data: a thin immutable wrapper around a ByteBuffer.
 * The buffer is stored as-is (no copy), so callers share its read position.
 */
public class Token {

    private final ByteBuffer byteBuffer;

    /**
     * Wrap the given buffer as a Token. The buffer is not copied.
     *
     * @param inputByteBuffer buffer holding this token's bytes
     */
    public Token(ByteBuffer inputByteBuffer) {
        this.byteBuffer = inputByteBuffer;
    }

    /** @return the underlying buffer (shared, not a defensive copy) */
    public ByteBuffer getByteBuffer() {
        return this.byteBuffer;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package htsjdk.samtools.cram.compression.nametokenisation;

import java.nio.ByteBuffer;

public class utils {

    // Utility class: no instances.
    private utils() {}

    /**
     * Read a variable-length "uint7" integer (7 data bits per byte, most
     * significant group first; high bit set means another byte follows) from
     * the buffer, advancing its position past the encoded value.
     *
     * @param cp buffer to read from
     * @return the decoded non-negative value
     */
    public static int readUint7(ByteBuffer cp) {
        int value = 0;
        int currentByte;
        do {
            // read byte; the sign of the byte is irrelevant because both
            // masks (0x7f and 0x80) discard the widened high bits.
            currentByte = cp.get();
            value = (value << 7) | (currentByte & 0x7f);
        } while ((currentByte & 0x80) != 0);
        return value;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package htsjdk.samtools.cram;

import htsjdk.HtsjdkTest;
import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationDecode;
import org.apache.commons.compress.utils.IOUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

public class NameTokenizationInteropTest extends HtsjdkTest {
    public static final String COMPRESSED_TOK_DIR = "tok3";

    @DataProvider(name = "allNameTokenizationFiles")
    public Object[][] getAllRansCodecsForRoundTrip() throws IOException {
        // params:
        // compressed testfile path, uncompressed testfile path, NameTokenization decoder
        final List<Object[]> testCases = new ArrayList<>();
        for (final Path path : getInteropNameTokenizationCompressedFiles()) {
            testCases.add(new Object[]{
                    path,
                    getNameTokenizationUnCompressedFilePath(path),
                    new NameTokenisationDecode()
            });
        }
        return testCases.toArray(new Object[][]{});
    }

    @Test(
            dataProvider = "allNameTokenizationFiles",
            description = "Uncompress the existing compressed file using htsjdk NameTokenization " +
                    "and compare it with the original file.")
    public void testNameTokenizationDecoder(
            final Path compressedFilePath,
            final Path uncompressedFilePath,
            final NameTokenisationDecode nameTokenisationDecode) throws IOException {
        // try-with-resources: the original code leaked both InputStreams.
        final ByteBuffer preCompressedInteropBytes;
        try (final InputStream preCompressedInteropStream = Files.newInputStream(compressedFilePath)) {
            preCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(preCompressedInteropStream));
        }
        final ByteBuffer unCompressedInteropBytes;
        try (final InputStream unCompressedInteropStream = Files.newInputStream(uncompressedFilePath)) {
            unCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream));
        }
        final String decompressedNames = nameTokenisationDecode.uncompress(preCompressedInteropBytes);
        final ByteBuffer decompressedNamesBuffer = StandardCharsets.UTF_8.encode(decompressedNames);
        Assert.assertEquals(decompressedNamesBuffer, unCompressedInteropBytes);
    }

    // return a list of all NameTokenization encoded test data files in the htscodecs/tests/names/tok3 directory
    private List<Path> getInteropNameTokenizationCompressedFiles() throws IOException {
        final List<Path> paths = new ArrayList<>();
        // DirectoryStream must be closed, otherwise the directory handle leaks.
        try (final DirectoryStream<Path> directoryStream = Files.newDirectoryStream(
                CRAMInteropTestUtils.getInteropTestDataLocation().resolve("names/" + COMPRESSED_TOK_DIR),
                path -> Files.isRegularFile(path))) {
            for (final Path path : directoryStream) {
                paths.add(path);
            }
        }
        return paths;
    }

    // Given a compressed test file path, return the corresponding uncompressed file path
    public static final Path getNameTokenizationUnCompressedFilePath(final Path compressedInteropPath) {
        final String uncompressedFileName = getUncompressedFileName(compressedInteropPath.getFileName().toString());
        // Example compressedInteropPath: ../names/tok3/01.names.1 => unCompressedFilePath: ../names/01.names
        return compressedInteropPath.getParent().getParent().resolve(uncompressedFileName);
    }

    // Returns the original file name by stripping the trailing ".N" compression-type suffix
    public static final String getUncompressedFileName(final String compressedFileName) {
        final int lastDotIndex = compressedFileName.lastIndexOf(".");
        if (lastDotIndex >= 0) {
            return compressedFileName.substring(0, lastDotIndex);
        } else {
            // Fixed message typos: "perios" -> "period", and a missing space that
            // produced "thatindicates" when the literals were concatenated.
            throw new CRAMException("The format of the compressed File Name is not as expected. " +
                    "The name of the compressed file should contain a period followed by a number that " +
                    "indicates type of compression. Actual compressed file name = " + compressedFileName);
        }
    }

}