HadoopGenomics · ilveroluca · Jul 3, 2014 · Jul 3, 2014 · Aug 7, 2014 · Aug 7, 2014
diff --git a/doc/seqpig_reference.tex b/doc/seqpig_reference.tex
@@ -199,12 +199,10 @@ \subsubsection{\texttt{BamStorer} and \texttt{SamStorer}}
 & \emph{attributes} & \texttt{map} & SAMRecord attributes
 \end{tabular}
 
-\subsubsection{\texttt{FastqLoader} and \texttt{QseqLoader}}
+\subsubsection{\texttt{FastqLoader}}
 
-Both loaders for unaligned read file formats Fastq and Qseq
-essentially provide the same output schema for the tuple field names
-they produce.  Note that some fields that are not present in the input
-data may remain empty.
+The loader for Fastq files provides the following output schema.  Note that
+fields that are not present in the input data remain empty.
 
 \begin{tabular}{lp{0.15\textwidth}p{0.15\textwidth}p{0.4\textwidth}}
 Usage: & \multicolumn{3}{l}{}
@@ -227,17 +225,23 @@ \subsubsection{\texttt{FastqLoader} and \texttt{QseqLoader}}
 & \emph{control\_number} & \texttt{integer} & control number\\
 & \emph{index\_sequence} & \texttt{chararray} & index sequence\\
 & \emph{sequence} & \texttt{chararray} & read bases\\
-& \emph{quality} & \texttt{chararray} & base qualities
+& \emph{quality} & \texttt{chararray} & base qualities (ASCII-encoded Sanger
+format)\\
+& \emph{id} & \texttt{chararray} & entire read id (as found in input)
 \end{tabular}
 
-\subsubsection{\texttt{FastqStorer} and \texttt{QseqStorer}}
+\subsubsection{\texttt{FastqStorer}}
 
-The Fastq and Qseq storer input schemas are identical and both are
-essentially equal to the output schema of the corresponding loader
-functions. Note that the order of the fields inside tuples does not
-matter, only their field names need to be present. All fields except
+The \texttt{FastqStorer}'s input schema is identical to the corresponding loader
+function's output schema. Note that the order of the fields inside tuples does
+not matter -- only their field names need to be present. All fields except
 \emph{sequence} and \emph{quality} are optional.
 
+The ``id'' field is optional and should be used only to override the default
+read id format (which follows the convention used in Illumina's Fastq files)
+with a custom one.  If ``id'' is set, the provided string is used as the read
+id and the other metadata fields are ignored.
+
 \begin{tabular}{lp{0.15\textwidth}p{0.15\textwidth}p{0.4\textwidth}}
 Usage: & \multicolumn{3}{l}{}
 \hspace*{-0.55cm}\begin{minipage}{0.8\textwidth}
@@ -259,9 +263,29 @@ \subsubsection{\texttt{FastqStorer} and \texttt{QseqStorer}}
 & \emph{control\_number} & \texttt{integer} & control number\\
 & \emph{index\_sequence} & \texttt{chararray} & index sequence\\
 & \emph{sequence} & \texttt{chararray} & read bases\\
-& \emph{quality} & \texttt{chararray} & base qualities
+& \emph{quality} & \texttt{chararray} & base qualities\\
+& \emph{id} & \texttt{chararray} & id of the read (overrides metadata fields)
 \end{tabular}
 
+\subsubsection{\texttt{QseqLoader}}
+
+The \texttt{QseqLoader} produces essentially the same output schema as the
+\texttt{FastqLoader}, with two differences: it does not produce the
+``control\_number'' field or the ``id'' field, since neither is present in
+Qseq files.
+
+Note that the base qualities read from Qseq files are converted to the
+Sanger encoding (ASCII character $q + 33$).
+
+\subsubsection{\texttt{QseqStorer}}
+
+Analogously to the \texttt{FastqStorer}, the \texttt{QseqStorer}'s
+input schema is identical to the output schema of the corresponding loader
+function.  Note that the order of the fields inside tuples does not
+matter; only their field names need to be present. All fields except
+\emph{sequence} and \emph{quality} are optional.
+
+
 \subsubsection{\texttt{FastaLoader}}
 
 The FastaLoader loads reference sequence data in FASTA format and

diff --git a/src/fi/aalto/seqpig/io/FastqLoader.java b/src/fi/aalto/seqpig/io/FastqLoader.java
@@ -77,6 +77,7 @@ public class FastqLoader extends LoadFunc implements LoadMetadata {
     //   index_sequence: string
     //   sequence: string
     //   quality: string (note: we assume that encoding chosen on command line!!!)
+    //   id: string
 
     public FastqLoader() {}
 
@@ -95,8 +96,6 @@ public Tuple getNext() throws IOException {
             Text fastqrec_name = ((FastqRecordReader)in).getCurrentKey();
             SequencedFragment fastqrec = ((FastqRecordReader)in).getCurrentValue();
 
-            //mProtoTuple.add(new String(fastqrec_name.toString()));
-
             mProtoTuple.add(fastqrec.getInstrument());
             mProtoTuple.add(fastqrec.getRunNumber());
             mProtoTuple.add(fastqrec.getFlowcellId());
@@ -110,6 +109,7 @@ public Tuple getNext() throws IOException {
             mProtoTuple.add(fastqrec.getIndexSequence());
             mProtoTuple.add(fastqrec.getSequence().toString());
             mProtoTuple.add(fastqrec.getQuality().toString());
+            mProtoTuple.add(fastqrec_name.toString());
 
             Tuple t =  mTupleFactory.newTupleNoCopy(mProtoTuple);
             mProtoTuple = null;
@@ -152,6 +152,7 @@ public ResourceSchema getSchema(String location, Job job) throws IOException {
         s.add(new Schema.FieldSchema("index_sequence", DataType.CHARARRAY));
         s.add(new Schema.FieldSchema("sequence", DataType.CHARARRAY));
         s.add(new Schema.FieldSchema("quality", DataType.CHARARRAY));
+        s.add(new Schema.FieldSchema("id", DataType.CHARARRAY));
 
         return new ResourceSchema(s);
     }

diff --git a/src/fi/aalto/seqpig/io/FastqStorer.java b/src/fi/aalto/seqpig/io/FastqStorer.java
@@ -171,8 +171,14 @@ public void putNext(Tuple f) throws IOException {
             fastqrec.setQuality(new Text((String)f.get(index)));
         }
 
+        Text key = null;
+        index = getFieldIndex("id", allFastqFieldNames);
+        if(index > -1 && DataType.findType(f.get(index)) == DataType.CHARARRAY) {
+            key = new Text((String)f.get(index));
+        }
+
         try {
-            writer.write(null, fastqrec);
+            writer.write(key, fastqrec);
         } catch (InterruptedException e) {
             throw new IOException(e);
         }