1 package com.larvalabs.starfish.examples.sequence;
2 
3 import com.larvalabs.starfish.algorithm.*;
4 import com.larvalabs.starfish.algorithm.parameter.*;
5 import com.larvalabs.starfish.util.Uuid;
6 import jaligner.Alignment;
7 import jaligner.Matrix;
8 import jaligner.Sequence;
9 import jaligner.SmithWatermanGotoh;
10import jaligner.formats.Format;
11import jaligner.formats.Pair;
12import jaligner.util.MatrixLoader;
13import org.apache.commons.logging.Log;
14import org.apache.commons.logging.LogFactory;
15
16import java.io.*;
17
18/**
19 * Algorithm that does JAlign sequence alignment on DNA sequences.
20 */
21public class JAlignAlgorithm extends Algorithm {
22
23    /**
24     * Apache commons-logging Logging instance.
25     */
26    static Log log = LogFactory.getLog(JAlignAlgorithm.class);
27
28    // Stores the parameter values
29    private String keySequence;
30    private File dbSequenceFile;
31    private long dbSequenceLength;
32    private int segmentSize;
33    private int segmentOverlap;
34    private String scoringMatrix;
35    private float openGapPenalty;
36    private float extendGapPenalty;
37    private int desiredSegmentation;
38
39    // Constants that define the algorithm parameters and their descriptions.
40    public static final String PARAM_KEY_SEQUENCE = "key.sequence";
41    public static final String DESC_KEY_SEQUENCE = "Search Sequence";
42    public static final String PARAM_DB_SEQUENCE_FILENAME = "db.sequence.filename";
43    public static final String DESC_DB_SEQUENCE_FILENAME = "DB Sequence File";
44    public static final String PARAM_SCORING_MATRIX = "scoring.matrix";
45    public static final String DESC_SCORING_MATRIX = "Scoring Matrix";
46    public static final String PARAM_OPEN_GAP_PENALTY = "open.gap.penalty";
47    public static final String DESC_OPEN_GAP_PENALTY = "Open Gap Penalty";
48    public static final String PARAM_EXTEND_GAP_PENALTY = "extend.gap.penalty";
49    public static final String DESC_EXTEND_GAP_PENALTY = "Extend Gap Penalty";
50    public static final String PARAM_DESIRED_SEGMENTATION = "desired.segmentation";
51    public static final String DESC_DESIRED_SEGMENTATION = "Desired Segmentation";
52
53    /**
54     * These are scoring matrices available for DNA data.
55     */
56    public static final String[] SCORING_MATRICES =
57            {
58                "EDNAFULL",
59                "BLOSUM30",
60                "BLOSUM40",
61                "BLOSUM50",
62                "BLOSUM60",
63                "BLOSUM70",
64                "BLOSUM80",
65                "BLOSUM90"
66            };
67
68    /**
69     * This is the default segmentation that we will use if the user does not override it.
70     */
71    public static final int DEFAULT_SEGMENTATION = 100;
72
73    /**
74     * This returns a human-readable name for our algorithm.
75     * @return
76     */
77    public String getName() {
78        return "Smith-Waterman-Gotoh Sequence Alignment";
79    }
80
81    /**
82     * This method allows our algorithm to specify our parameters to Starfish.
83     * @param parameterSet we will use this object to set our parameters.
84     */
85    public void setParameters(ParameterSet parameterSet) {
86        // This is the key sequence that we are searching for. We use a regular expression for validation.
87        StringType keyParam = new StringType(PARAM_KEY_SEQUENCE, DESC_KEY_SEQUENCE, true, 0, "[atgcATGC]*", "Key sequence must be a nucleic sequence.", null);
88        keyParam.setExpectingLargeInput(true);
89        parameterSet.addParameter(keyParam);
90        // This is the filename of the large, databse sequence. This must exist on the node that is creating the problem.
91        FileType fileParam = new FileType(PARAM_DB_SEQUENCE_FILENAME, DESC_DB_SEQUENCE_FILENAME, true, null, null);
92        parameterSet.addParameter(fileParam);
93        // This is the scoring matrix to use. Note that we specify a default value.
94        SelectorType scoreParam = new SelectorType(PARAM_SCORING_MATRIX, DESC_SCORING_MATRIX, true, SCORING_MATRICES, SCORING_MATRICES[0]);
95        parameterSet.addParameter(scoreParam);
96        // This is the penalty incurred for opening a gap between the key and DB sequences. It defaults to 10.
97        FloatType openGapParam = new FloatType(PARAM_OPEN_GAP_PENALTY, DESC_OPEN_GAP_PENALTY, true, new Float(10f));
98        parameterSet.addParameter(openGapParam);
99        // This is the penalty incurred for continuing an already-opened gap. It defaults to 0.5.
00        FloatType extendGapParam = new FloatType(PARAM_EXTEND_GAP_PENALTY, DESC_EXTEND_GAP_PENALTY, true, new Float(0.5f));
01        parameterSet.addParameter(extendGapParam);
02        // This is an optional parameter. It allows the user to specify how segmented they would like the problem's
03        // computation to be. If not specified, the algorithm will determine a good value on its own.
04        IntegerType segmentationParam = new IntegerType(PARAM_DESIRED_SEGMENTATION, DESC_DESIRED_SEGMENTATION, false, null);
05        parameterSet.addParameter(segmentationParam);
06    }
07
08    /**
09     * This method passes in the values for the our specified parameters.
10     * These are guaranteed to be valid according to the requirements we specified in {@link #setParameters(com.larvalabs.starfish.algorithm.ParameterSet)}.
11     * @param parameters the parameter values.
12     */
13    public void initialize(Parameters parameters) {
14        keySequence = (String) parameters.getParameter(PARAM_KEY_SEQUENCE);
15        segmentOverlap = keySequence.length();
16        dbSequenceFile = (File) parameters.getParameter(PARAM_DB_SEQUENCE_FILENAME);
17        scoringMatrix = (String) parameters.getParameter(PARAM_SCORING_MATRIX);
18        openGapPenalty = ((Float) parameters.getParameter(PARAM_OPEN_GAP_PENALTY)).floatValue();
19        extendGapPenalty = ((Float) parameters.getParameter(PARAM_EXTEND_GAP_PENALTY)).floatValue();
20        Integer segmentation = (Integer)parameters.getParameter(PARAM_DESIRED_SEGMENTATION);
21        if (segmentation != null) {
22            desiredSegmentation = segmentation.intValue();
23        } else {
24            desiredSegmentation = DEFAULT_SEGMENTATION;
25        }
26    }
27
28    /**
29     * Here we compute the total number of segments.
30     * We try to use the desired segmentation requested by the user.
31     * @return the total number of segments for this problem.
32     */
33    public int totalNumberOfSegments() {
34        dbSequenceLength = dbSequenceFile.length();
35        segmentSize = (int)Math.max(dbSequenceLength/desiredSegmentation, 10 * segmentOverlap);
36        log.debug("Segment size: " + segmentSize);
37        double k = 1 + ((double) dbSequenceLength - segmentSize) / (segmentSize - segmentOverlap);
38        return (int) Math.ceil(k);
39    }
40
41    /**
42     * This generates the overlapping subsequences that we will search in order to find the optimal match.
43     * @param segmentInputData this is where we register the subsequences as Starfish segment data.
44     */
45    public void generateSegmentInputData(SegmentInputData segmentInputData) {
46        try {
47            int k = totalNumberOfSegments();
48            BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(dbSequenceFile)));
49            long charsRemaining = dbSequenceLength;
50            char[] lastSegment = null;
51            long segmentStartIndex = 0;
52            for (int i = 0; i < k; i++) {
53                int currentSegmentSize = segmentSize;
54                if (segmentSize > charsRemaining) {
55                    currentSegmentSize = (int) charsRemaining;
56                }
57                char[] segment = new char[currentSegmentSize];
58                int j = 0;
59                if (lastSegment != null) {
60                    int lastIndex = lastSegment.length - segmentOverlap;
61                    while (j < segmentOverlap) {
62                        segment[j] = lastSegment[lastIndex];
63                        j++;
64                        lastIndex++;
65                    }
66                }
67                int charsToRead = currentSegmentSize - j;
68                in.read(segment, j, charsToRead);
69                String segmentString = new String(segment);
70                segmentInputData.setSegmentInputData(i, new SequenceSegment(segmentString, segmentStartIndex - segmentOverlap));
71                charsRemaining -= charsToRead;
72                segmentStartIndex += charsToRead;
73                lastSegment = segment;
74            }
75            in.close();
76        } catch (Exception e) {
77            log.error("Could not read db sequence!", e);
78        }
79    }
80
81    /**
82     * This is the "meat" of the algorithm. Here we compute the optimal alignment of the key sequence with the
83     * subsequence passed to us in <code>segmentData</cope>
84     * @param segmentNum the index of this segment.
85     * @param segmentData the segment data-- in this case it is a subsequence.
86     * @return we return the best alignment result together with scoring information about the alignment.
87     */
88    public Serializable processSegment(int segmentNum, Object segmentData) {
89        try {
90            SequenceSegment segment = (SequenceSegment) segmentData;
91            String dbSequence = segment.getSequence();
92            Sequence s1 = new Sequence(keySequence, "Key", "Key Sequence", Sequence.NUCLEIC);
93            Sequence s2 = new Sequence(dbSequence, "DB", "Database Sequence", Sequence.NUCLEIC);
94            Matrix matrix = MatrixLoader.load(scoringMatrix);
95            Alignment alignment = SmithWatermanGotoh.align(s1, s2, matrix, openGapPenalty, extendGapPenalty);
96            alignment.setStart2((int)(alignment.getStart2() + segment.getStartIndex()));
97            AlignmentResult result = new AlignmentResult(alignment);
98            return result;
99        } catch (Exception e) {
00            log.error("Error processing alignment!", e);
01            return null;
02        }
03    }
04
05    /**
06     * Finally, we will determine which segment found the best alignment and report that as the optimal.
07     * @param results the Universally-Unique Identifiers of the results.
08     * @param resources a handle for acquiring the segment result resources.
09     * @return the problem result, in the case a string with the resulting alignment and score.
10     */
11    public ProblemResult processResults(Uuid[] results, Resources resources) {
12        AlignmentResult best = null;
13        for (int i = 0; i < results.length; i++) {
14            Uuid uuid = (Uuid) results[i];
15            AlignmentResult result = (AlignmentResult) resources.getResource(uuid);
16            if ((best == null) || (best.getScore() < result.getScore())) {
17                best = result;
18            }
19        }
20        Alignment alignment = best.getAlignment();
21        log.debug("Start1: " + alignment.getStart1());
22        log.debug("Start2: " + alignment.getStart2());
23        Format format = new Pair();
24        log.debug("Score: " + alignment.getScore());
25        String output = "Score: " + alignment.getScore() + "\nAlignment:\n" + format.format(alignment);        
26        log.debug(output);
27        return ProblemResult.createStringProblemResult(output);
28    }
29}
30