#!/usr/pkg/bin/python
Copyright = """
    fa2mid.py - convert from fastA format to standard MIDI
    Copyright (C) 2003  John Comeau <jc@jcomeau.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
    """

try:
    import sys, os, re
except:
    sys.stderr.write("Needs os and re; upgrade preferably to version 2.3\n")
    sys.exit(1)

# Global data
# define boolean True on older Python interpreters (before 2.3)
try:
    True
except:
    True = 1
    False = 0
LongLength = 4 # bytes produced by FixedLong / consumed by ReadFixedLong
ShortLength = 2 # bytes produced by FixedShort / consumed by ReadFixedShort
Octave = 12 # 12 half-steps (semitones) make an octave
NormalVelocity = 64 # use for note off events, base all notes around this
ByteMask = 0xffL # must be longword to avoid warnings about left shifts
DeltaZero = chr(0) # delta-time of zero prefixes many MIDI events
# meta-events: each starts with a zero delta-time followed by 0xff and a type
EndOfTrack = DeltaZero + "\xff\x2f\x00" # type 0x2f with a zero-length payload
LyricEvent = DeltaZero + "\xff\x05" # callers append length and text
TextEvent = DeltaZero + "\xff\x01" # callers append length and text
CopyrightNotice = DeltaZero + "\xff\x02" # callers append length and text
RCSid = "$Id: fa2mid.py,v 1.75 2003/10/10 19:55:08 jcomeau Exp $"
MThd = "MThd" # MIDI header ID string
MTrk = "MTrk" # MIDI track ID string
# map bases to their inverse
BasePairs = {'T': 'A', 'C': 'G', 'A': 'T', 'G': 'C',
             't': 'a', 'c': 'g', 'a': 't', 'g': 'c', 'N': 'N'}
# The next four names are running state that GenomeNoteData updates on
# every call, so they carry over from one input line to the next.
CurrentCodon = "NNN" # 'N' signifies unknown base
Frame = 0 # 0-based representation of frames 1, 2, or 3
Coding = {0: 'X', 1: 'X', 2: 'X'} # 'X' indicates genetic code terminator
EOL = "" # end-of-line characters found in input stream

# Subroutines
def VarLen(LongValue):
    """Encode LongValue as a MIDI variable-length quantity.

    The value is cut into 7-bit groups which are emitted most
    significant first; every byte except the last has bit 0x80 set.
    Zero (or any value that is not positive) encodes as a single
    zero byte.
    """
    Groups = []
    Remaining = LongValue
    while Remaining > 0:
        Groups.append(Remaining & 0x7f) # collected least significant first
        Remaining = Remaining >> 7
    if len(Groups) == 0:
        Groups.append(0)
    Groups.reverse()
    Encoded = ""
    for Septet in Groups[:-1]:
        Encoded = Encoded + chr((Septet | 0x80) & ByteMask)
    return Encoded + chr(Groups[-1] & ByteMask)

def ReadVarLen(Stream):
    "Read a variable-length quantity (7 data bits per byte) from Stream"
    # NOTE(review): NextBytes() is not defined in the visible part of this
    # file -- confirm where it is supposed to come from before calling this.
    Value, Byte, Signal = 0L, 0, 0x80
    while Signal > 0:
        # original author's note: NextBytes() will croak on EOF
        Byte = ord(NextBytes(Stream, 1))
        Signal = Signal & Byte # stays 0x80 only while the high bit is set
        Byte = Byte & ~Signal # strip the high (continuation) bit
        Value = (Value << 7) + Byte
    return Value

def FixedLong(LongValue):
    "Encode LongValue as LongLength bytes, most significant byte first"
    Encoded = ""
    Shift = (LongLength - 1) * 8
    while Shift >= 0:
        # move the wanted byte down to the low end, then isolate it
        Encoded = Encoded + chr((LongValue >> Shift) & ByteMask)
        Shift = Shift - 8
    return Encoded

def ReadFixedLong(Stream):
    "Read LongLength bytes from Stream as a big-endian unsigned number"
    Raw = NextBytes(Stream, LongLength)
    Total, Position = 0, 0
    while Position < LongLength:
        # slicing (not indexing) so a short read fails inside ord()
        Octet = ord(Raw[Position:Position + 1])
        Total = (Total << 8) + Octet
        Position = Position + 1
    return Total
    
def FixedShort(ShortValue):
    "Encode ShortValue as ShortLength bytes, most significant byte first"
    Encoded = ""
    Shift = (ShortLength - 1) * 8
    while Shift >= 0:
        # move the wanted byte down to the low end, then isolate it
        Encoded = Encoded + chr((ShortValue >> Shift) & ByteMask)
        Shift = Shift - 8
    return Encoded

def ReadFixedShort(Stream):
    "Read ShortLength bytes from Stream as a big-endian unsigned number"
    Raw = NextBytes(Stream, ShortLength)
    Total, Position = 0, 0
    while Position < ShortLength:
        # slicing (not indexing) so a short read fails inside ord()
        Octet = ord(Raw[Position:Position + 1])
        Total = (Total << 8) + Octet
        Position = Position + 1
    return Total

def NoteOn(Delay, Channel, Note, Velocity):
    "Return a note-on event (status 0x90 + Channel) preceded by Delay"
    Parts = [VarLen(Delay), chr(0x90 + Channel), chr(Note), chr(Velocity)]
    return "".join(Parts)

def NoteOff(Delay, Channel, Note, Velocity):
    "Return a note-off event (status 0x80 + Channel) preceded by Delay"
    Parts = [VarLen(Delay), chr(0x80 + Channel), chr(Note), chr(Velocity)]
    return "".join(Parts)
    
def NoteDef(Value, Velocity, Duration): # for bases
    """Return a complete note on channel 0: note-on now, note-off later.

    The note-on has a zero delta-time and uses Velocity; the matching
    note-off follows Duration ticks later and uses NormalVelocity.
    """
    Channel = 0
    Start = NoteOn(0, Channel, Value, Velocity)
    Stop = NoteOff(Duration, Channel, Value, NormalVelocity)
    return Start + Stop

def NoteMapping():
    """Map the letters 'A'..'Z' to ascending MIDI note numbers.

    'A' is note 57 (an A) and each following letter takes the next
    natural note, so only steps of one or two semitones occur.
    """
    mapping = {}
    pitch = 57 # the A that the letter 'A' is pinned to
    letter_code = ord('A')
    while letter_code <= ord('Z'):
        mapping[chr(letter_code)] = pitch
        # E->F ('mi' to 'fa') and B->C ('ti' to 'do') are one semitone
        # apart; every other neighbouring pair is two
        if pitch % 12 in (4, 11):
            pitch = pitch + 1
        else:
            pitch = pitch + 2
        letter_code = letter_code + 1
    return mapping

def RelativeVelocity(BaseVelocity, NoteMidpoint, Note, Power, Options):
    """Scale BaseVelocity down for high notes and up for low ones.

    The scale factor is (NoteMidpoint / Note) raised to Power, so a note
    at NoteMidpoint keeps BaseVelocity. The result is capped at 127 and
    truncated to an int. Options is only used for debug output.
    """
    ratio = float(NoteMidpoint) / float(Note)
    scaled = float(BaseVelocity) * pow(ratio, Power) # Power bends the curve
    if scaled > 127.0:
        scaled = 127.0 # cannot exceed velocity value of 127
    result = int(scaled)
    debugprint(Options, "adjusting velocity from " + str(BaseVelocity) + \
        " to " + str(result))
    return result

def Lyric(text):
    "Return text wrapped as a lyric meta-event, or '' for empty text"
    if len(text) == 0:
        return ''
    return LyricEvent + VarLen(len(text)) + text

def EmbedText(text):
    "Return text wrapped as a text meta-event, or '' for empty text"
    if len(text) == 0:
        return ''
    return TextEvent + VarLen(len(text)) + text

def EmbedCopyright(text):
    "Return text wrapped as a copyright meta-event, or '' for empty text"
    if len(text) == 0:
        return ''
    return CopyrightNotice + VarLen(len(text)) + text

def BaseNoteDef(Duration, Options):
    """Build the ready-to-emit note events for each base letter.

    Returns a dictionary keyed by 'T', 'C', 'A', 'G' and, unless
    Options.exons_only is set, also 't', 'c', 'a', 'g' (same pitches,
    played more softly). Each value is the event string from NoteDef,
    prefixed with a lyric event for the letter when Options.lyriclevel
    is above zero.
    """
    Letters = NoteMapping()
    MezzoPiano, MezzoForte = 64, 80 # really 48, 64 but this works better
    # drop everything down so the amino acids sound higher
    T = Letters['T'] - (2 * Octave)
    T = T - (Options.lower_t * Octave) # drop to level of F by default
    C = Letters['C'] - (2 * Octave)
    A = Letters['A'] - (2 * Octave)
    G = Letters['G'] - (2 * Octave)
    power = 0.8 # used to adjust relative velocity, obtained by trial+error
    pitches = (('T', T), ('C', C), ('A', A), ('G', G))
    notemap = {}
    # uppercase bases at the louder level; C is the velocity midpoint
    for letter, pitch in pitches:
        notemap[letter] = NoteDef(pitch, RelativeVelocity(
            MezzoForte, C, pitch, power, Options), Duration)
    if not Options.exons_only:
        # lowercase bases get the same pitches at the softer level
        for letter, pitch in pitches:
            notemap[letter.lower()] = NoteDef(pitch, RelativeVelocity(
                MezzoPiano, C, pitch, power, Options), Duration)
    if Options.lyriclevel > 0:
        for letter in notemap.keys():
            notemap[letter] = Lyric(letter) + notemap[letter]
    return notemap

def mRNA_mapping(Options):
    """Return the table that turns a DNA base letter into an mRNA letter.

    Uppercase bases always map to their mRNA letter ('T' becomes 'U');
    'N' maps to itself. Lowercase bases map to the uppercase mRNA letter
    too, unless Options.exons_only is true: then they are left lowercase,
    so a codon containing one can never match the (uppercase) genetic
    code. An Options object without an exons_only attribute is treated
    as exons_only = False.
    """
    # getattr with a default replaces a bare "except:" that would also
    # have hidden unrelated errors
    exons_only = getattr(Options, 'exons_only', False)
    mRNA = {'T': 'U', 'C': 'C', 'A': 'A', 'G': 'G', 'N': 'N'}
    for base in ('t', 'c', 'a', 'g'):
        if exons_only:
            mRNA[base] = base
        else:
            mRNA[base] = mRNA[base.upper()]
    return mRNA

def AminoAcids():
    """Entire genetic code with all amino acid representations.

    Lists all amino acids normally coded by mRNA by their 1-letter
    and 3-letter abbreviations, and all the codons which code for them.
    Note the 'U' (Uracil) instead of 'T' (Thymine) since amino acids
    are produced using mRNA not DNA.

    Returns a dictionary keyed by 1-letter abbreviation; each value is
    a tuple (full name, 3-letter abbreviation, tuple of codons). The
    key 'X' stands for the terminator codons. All 64 codons appear
    exactly once.
    """
    return {
        'G': ('Glycine', 'Gly', ('GGU', 'GGC', 'GGA', 'GGG',)),
        'A': ('Alanine', 'Ala', ('GCU', 'GCC', 'GCA', 'GCG',)),
        # Valine must be spelled with 'U' like every other entry: codons
        # are looked up after 'T' has been translated to 'U'
        'V': ('Valine', 'Val', ('GUU', 'GUC', 'GUA', 'GUG',)),
        'L': ('Leucine', 'Leu', ('UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG',)),
        'I': ('Isoleucine', 'Ile', ('AUU', 'AUC', 'AUA',)),
        'P': ('Proline', 'Pro', ('CCU', 'CCC', 'CCA', 'CCG',)),
        'F': ('Phenylalanine', 'Phe', ('UUU', 'UUC',)),
        'Y': ('Tyrosine', 'Tyr', ('UAU', 'UAC',)),
        'W': ('Tryptophan', 'Trp', ('UGG',)),
        'S': ('Serine', 'Ser', ('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC',)),
        'T': ('Threonine', 'Thr', ('ACU', 'ACC', 'ACA', 'ACG',)),
        'C': ('Cysteine', 'Cys', ('UGU', 'UGC',)),
        'M': ('Methionine', 'Met', ('AUG',)),
        'N': ('Asparagine', 'Asn', ('AAU', 'AAC',)),
        'Q': ('Glutamine', 'Gln', ('CAA', 'CAG',)),
        'D': ('Aspartate', 'Asp', ('GAU', 'GAC',)),
        'E': ('Glutamate', 'Glu', ('GAA', 'GAG',)),
        'K': ('Lysine', 'Lys', ('AAA', 'AAG',)),
        'R': ('Arginine', 'Arg', ('CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG',)),
        'H': ('Histidine', 'His', ('CAU', 'CAC',)),
        'X': ('terminator', 'ter', ('UAA', 'UAG', 'UGA',)),
    }

def GeneticCode():
    "Invert AminoAcids(): returns a dictionary from codon to amino acid letter"
    acids = AminoAcids()
    lookup = {}
    for letter in acids.keys():
        triplets = acids[letter][2]
        for triplet in triplets:
            lookup[triplet] = letter
    return lookup

def Abbreviations():
    """Return the lyric text for every amino acid at every lyric level.

    Each value is a 5-tuple indexed by lyric level: levels 0 and 1 give
    no text, then come the parenthesised 1-letter code, 3-letter code
    and full name, e. g. K: "", "", "(K)", "(Lys)", "(Lysine)"
    """
    levels = {}
    for letter, names in AminoAcids().items():
        fullname, short = names[0], names[1]
        levels[letter] = ("", "", "(%s)" % letter, "(%s)" % short,
            "(%s)" % fullname)
    return levels

def MidiHeader():
    "Return the MIDI header chunk: format 0, one track, 96 ticks per quarter"
    Fields = [
        MThd,
        FixedLong(6), # chunk length: the three shorts below, 6 bytes
        FixedShort(0), # MIDI file format 0, simplest format
        FixedShort(1), # number of tracks
        FixedShort(96), # time division, ticks per quarter note
    ]
    return "".join(Fields)

def MidiTrackCommon(Options):
    """Return the events that open the track, before any note data.

    In order: time signature (4/4), tempo (500000 microseconds per
    quarter note), the copyright notice from Options.copyright (left out
    when empty) and one program change for each of channels 0 to 3 taken
    from Options.instrument, instrument1, instrument2 and instrument3.
    """
    # meta-event 0x58, 4 data bytes: numerator 4, denominator 2 (2 ** -2
    # gives 1/4), 24 MIDI clocks per metronome tick, 8 notated 32nd notes
    # per 24 MIDI clocks
    TimeSignature = DeltaZero + "\xff\x58" + chr(4) + \
        chr(4) + chr(2) + chr(24) + chr(8)
    # meta-event 0x51, 3 data bytes: the low 24 bits of the tempo value
    Tempo = DeltaZero + "\xff\x51" + chr(3) + FixedLong(500000)[1:4]
    CopyrightEvent = EmbedCopyright(Options.copyright)
    # options are 1-based instrument numbers, the wire format is 0-based
    # examples: 1 = Grand Piano, 13 = Marimba
    Programs = (Options.instrument, Options.instrument1,
        Options.instrument2, Options.instrument3)
    ProgramChange = ""
    for Channel in range(len(Programs)):
        ProgramChange = ProgramChange + DeltaZero + chr(0xc0 + Channel) + \
            chr(Programs[Channel] - 1)
    return (TimeSignature + Tempo + CopyrightEvent + ProgramChange)

def debugprint(options, text):
    "Write text and a newline to stderr when options.verbose is set"
    if not options.verbose:
        return
    if len(text) > 0: # never emit a bare newline
        sys.stderr.write(text + "\n")

def GenomeNoteData(DataString, NoteMap, BaseNoteMap, Options):
    """Change a single line of base-pair data into notes.

    This is the most complex routine because of the different options
    it has to handle. If there are any serious bugs in the program
    they are likely to be found here.

    DataString is one line of input; a line starting with '>' is a
    comment and produces no output. NoteMap maps amino acid letters to
    note numbers and BaseNoteMap maps base letters to ready-made event
    strings. Options supplies lyriclevel, duration and code_all, and is
    passed on to the helpers called from here.

    Returns the event data for the line as a string. State is carried
    from one call to the next in the module globals CurrentCodon, Frame,
    Coding and EOL, all of which this routine updates."""
    Notes = ""
    Code = GeneticCode()
    global CurrentCodon, Frame, Coding, EOL
    abbreviations = Abbreviations()
    mRNA = mRNA_mapping(Options)
    if DataString[0:1] == '>': # comment line, so skip it
        return ""
    for Index in range(len(DataString)):
        Base = DataString[Index] # should be in GATCgatc but could be N or \n
        # Step 1: the note for the base itself (dict.has_key is Python 2 only)
        if BaseNoteMap.has_key(Base):
            debugprint(Options, "playing note for " + Base)
            # a line end seen earlier is turned into a lyric line break
            # just before the next base that is actually played
            if Options.lyriclevel > 0 and len(EOL) > 0:
                Notes = Notes + Lyric("\r\n") # for WinAmp
                EOL = ''
            Notes = Notes + BaseNoteMap[Base]
        elif Base < ' ': # any control character should be end-of-line
            EOL = EOL + Base
        else:
            debugprint(Options, "skipping datum " + Base)
        # after the note for that base is played, ...
        # ... start the note for the amino acid created by that codon
        if mRNA.has_key(Base):
            # slide the 3-letter window along by one base
            CurrentCodon = CurrentCodon[1:3] + mRNA[Base]
            delay = 0
            if Base == 'N': # there was no delay above
                delay = Options.duration
                # the next branch is switched off by the "and False"
                if Options.lyriclevel > 0 and False: # can't do, we add 'NNN'
                    Notes = Notes + Lyric('N')
            # Step 2: stop the amino acid note still sounding in this frame
            if Coding[Frame] != 'X':
                debugprint(Options, "ending note " + Coding[Frame] + \
                    " after " + str(delay) + " MIDI clocks")
                Notes = Notes + NoteOff(delay, # end current note
                    Frame + 1, # first frame is 2nd channel
                    NoteMap[Coding[Frame]], NormalVelocity)
            # Step 3: start a new amino acid note when the codon is a known
            # non-terminator and either code_all is set, this frame was
            # already coding, or the codon is the start codon ('M')
            if Code.has_key(CurrentCodon) and \
                Code[CurrentCodon] != 'X' and \
                (Options.code_all or \
                Coding[Frame] != 'X' or \
                Code[CurrentCodon] == 'M'):
                Coding[Frame] = Code[CurrentCodon]
                debugprint(Options,
                    "starting note for amino acid " + Coding[Frame] + \
                        " " + abbreviations[Coding[Frame]][4])
                velocity = RelativeVelocity(32, NoteMap['F'],
                    NoteMap[Coding[Frame]], 2.0, Options)
                Notes = Notes + \
                    Lyric(abbreviations[Coding[Frame]][Options.lyriclevel])
                Notes = Notes + NoteOn(0, Frame + 1, NoteMap[Coding[Frame]],
                    velocity)
            # both remaining branches mark the frame as not coding; they
            # differ only in the debug message
            elif (not Code.has_key(CurrentCodon) or \
                Code[CurrentCodon] == 'X') and Coding[Frame] != 'X':
                debugprint(Options, "note was already ended above")
                Coding[Frame] = 'X'
            else:
                debugprint(Options, "setting Coding[Frame] to 'X'")
                Coding[Frame] = 'X'
            # each base belongs to the next reading frame in turn
            Frame = (Frame + 1) % 3
            debugprint(Options, "Frame is now " + str(Frame + 1))
    return Notes

def EndDataTrack(Options):
    "Clean up any loose ends then send end-of-track meta-event"
    NoteMap = NoteMapping()
    Closing = ''
    for FrameIndex in Coding.keys():
        Acid = Coding[FrameIndex]
        if Acid == 'X':
            continue # nothing is sounding in this frame
        debugprint(Options, "ending note for " + \
            Acid + " which should have ended before")
        # frame n plays on channel n + 1
        Closing = Closing + NoteOff(0, FrameIndex + 1, NoteMap[Acid],
            NormalVelocity)
    return Closing + EndOfTrack

def CheckMinMax(optionInstance, option, value, parser, *arguments):
    """optparse callback that accepts value only inside a closed range.

    arguments must hold exactly the two bounds (lowest, highest), as
    given through callback_args. An accepted value is stored on
    parser.values under optionInstance.dest.

    Raises optparse.OptionValueError when value is out of range.
    """
    # Imported here because the name is not available at module level; the
    # only other import of it is local to initialize_fa2mid, so raising it
    # from this function used to fail with NameError instead.
    from optparse import OptionValueError
    (lowest, highest) = arguments
    if value < lowest or value > highest:
        raise OptionValueError("value of " + option + " (" + str(value) + \
            ") must be between " + str(lowest) + " and " + str(highest))
    setattr(parser.values, optionInstance.dest, value)

def initialize_fa2mid():
    """Define the fa2mid command-line options and parse sys.argv.

    Returns a tuple (options, Input, Output). Input and Output are file
    names, or "-" for standard input/output. With a named input and no
    second argument the output name is the input name plus ".mid".
    Prints the help text and exits with status 0 when no positional
    argument is given; exits with status 1 when optparse is missing.
    """
    try:
        from optparse import OptionParser
    except ImportError:
        sys.stderr.write("Needs optparse, upgrade preferably to 2.3\n")
        sys.exit(1)
    EighthNotes, AcousticBass, ShowIntrons = 144, 33, False # defaults
    ChoirAahs = 53 # default voice for all three amino acid frames
    # program_name() returns None when argv[0] does not end in a run of
    # [a-z0-9]; fall back to the raw value instead of crashing on .group
    match = program_name(sys.argv[0])
    if match is None:
        invoked_as = sys.argv[0]
    else:
        invoked_as = match.group(0)
    parser = OptionParser(usage = \
        "%prog [options] INFILE|- [OUTFILE|-] (note: '-' means stdio)")
    parser.add_option("-d", "--duration", type = "int", action = "callback",
        callback = CheckMinMax, callback_args = (1, 2048), dest = "duration",
        default = EighthNotes, help = "MIDI clocks per eighth note")
    parser.add_option("-i", "--instrument", type = "int", action = "callback",
        callback = CheckMinMax, callback_args = (1, 128), dest = "instrument",
        default = AcousticBass,
        help = "1-based Standard MIDI instrument number for bases")
    # -1/-2/-3 differ only in the frame number
    for frame in ("1", "2", "3"):
        parser.add_option("-" + frame, "--frame" + frame + "_instrument",
            type = "int", action = "callback", callback = CheckMinMax,
            callback_args = (1, 128), dest = "instrument" + frame,
            default = ChoirAahs,
            help = "MIDI instrument number for frame " + frame + \
                " amino acids")
    parser.add_option("-l", "--with_lyrics", type = "int",
        action = "callback", callback = CheckMinMax, callback_args = (0, 4),
        dest = "lyriclevel", default = 0,
        help = "output genome data as MIDI lyrics (various verbosity levels)")
    parser.add_option("-t", "--lower_T", type = "int",
        action = "callback", callback = CheckMinMax, callback_args = (0, 2),
        dest = "lower_t", default = 2,
        help = "lower note for Thymine by this many octaves relative to C")
    parser.add_option("-c", "--copyright", default = "Created by: " + \
        invoked_as + ' ' + ' '.join(sys.argv[1:]) + Copyright,
        help = "string (enclosed in quotes) to use as copyright notice")
    parser.add_option("-e", "--exons_only", action="store_true",
        default = ShowIntrons, help = "ignore lowercased (intron) data")
    parser.add_option("-a", "--code_all", action = "store_true",
        default = False, help = "don't wait for AUG to start coding")
    parser.add_option("-v", "--verbose", action = "store_true",
        default = False, help = "output debugging information while processing")
    Input, Output = "-", "-"
    options, arguments = parser.parse_args()
    if len(arguments) < 1:
        parser.print_help()
        sys.exit(0)
    elif arguments[0] != "-":
        Input, Output = arguments[0], arguments[0] + ".mid"
    if len(arguments) > 1:
        if arguments[1] != "-":
            Output = arguments[1]
    return (options, Input, Output)

def fa2mid():
    """Convert fastA input to a single-track MIDI file.

    Input and output come from initialize_fa2mid(). Each input line is
    turned into events by GenomeNoteData; a final "NNN" is fed through
    to let the amino acid notes end, then EndDataTrack closes the track.

    The track chunk starts with its own length, which is only known at
    the end. For a real file a zero placeholder is written first and
    patched afterwards; standard output cannot be rewound, so there the
    whole track is collected in memory and written in one go.

    Written for Python 2, where str is a byte string. Both files are
    closed on the way out even if the conversion fails (this includes
    stdin/stdout when "-" is used).
    """
    (Options, Input, Output) = initialize_fa2mid()
    debugprint(Options, "processing: " + ' '.join(sys.argv))
    if Input == "-":
        InputFile = sys.stdin
    else:
        InputFile = open(Input, "r")
    try:
        if Output == "-":
            OutputFile = sys.stdout
        else:
            OutputFile = open(Output, "wb")
        try:
            NoteMap = NoteMapping()
            BaseNoteMap = BaseNoteDef(Options.duration, Options)
            Prefix = MidiHeader() + MTrk # everything before the track length
            if Output == "-":
                # we'll have to buffer entire output; better have lots of
                # swapspace! (pieces are joined once at the end)
                Pieces = []
                Emit = Pieces.append
            else:
                OutputFile.write(Prefix + FixedLong(0)) # length patched below
                Emit = OutputFile.write
            Emit(MidiTrackCommon(Options))
            while True:
                Line = InputFile.readline()
                if len(Line) == 0:
                    break
                Emit(GenomeNoteData(Line, NoteMap, BaseNoteMap, Options))
            debugprint(Options, "gradually cutting off amino acid tones")
            Emit(GenomeNoteData("NNN", NoteMap, BaseNoteMap, Options))
            Emit(EndDataTrack(Options))
            if Output == "-":
                TrackData = "".join(Pieces)
                OutputFile.write(Prefix + FixedLong(len(TrackData)) + \
                    TrackData)
            else:
                # we are at end of file: everything past the length field
                # is track data
                TrackLength = OutputFile.tell() - len(Prefix) - LongLength
                OutputFile.seek(len(Prefix))
                OutputFile.write(FixedLong(TrackLength))
        finally:
            OutputFile.close()
    finally:
        InputFile.close()

def packgenome():
    """Pack each fastA file named on the command line into FILE.2bit.

    A file that cannot be opened, or whose first line is not a plain
    '>name' header, is reported on stderr and skipped. See _pack_file
    for the layout of the output.
    """
    verbose = False
    packstring = 'GATCgatcNnMR'
    for path in sys.argv[1:]:
        _pack_file(path, packstring, verbose)

def _pack_fields(source, sink, datalength, packstring, field, verbose):
    """Write one 2-bit value per base to sink, four bases to a byte.

    Reads source one character at a time until datalength characters
    found in packstring have been consumed; any other character (such
    as a line end) is skipped. field maps a base's index in packstring
    to the 2-bit value to store. A final partial byte is padded with
    zero bits on the right.
    """
    count = 0
    chunk = 0
    while count < datalength:
        base = source.read(1)
        if len(base) == 0:
            # without this check find('') returns 0 and the end of file
            # would be packed as if it were 'G'
            sys.stderr.write("... unexpected end of input\n")
            break
        index = packstring.find(base)
        if index < 0:
            if verbose:
                sys.stderr.write("... skipping: " + str(ord(base)) + ";")
            continue
        chunk = (chunk << 2) | field(index)
        count = count + 1
        if (count % 4) == 0:
            sink.write(chr(chunk))
            chunk = 0
    if (count % 4) != 0:
        if verbose:
            sys.stderr.write("remaining " + str(count % 4) + \
                " bytes will be packed as one byte now\n")
        while (count % 4) != 0:
            chunk = chunk << 2
            count = count + 1
        sink.write(chr(chunk)) # write final partial byte

def _pack_file(path, packstring, verbose):
    """Pack one fastA file into path + '.2bit'.

    Output layout: the header line with ':1-<number of bases>' appended,
    a newline, the letter 'P', the packed bases (index in packstring
    modulo 4) and then the packed masks (index in packstring divided by
    4: [GATC] has mask of 0; [gatc] 1; and [NnMR] 2).
    """
    try:
        source = open(path, "r")
    except IOError:
        sys.stderr.write("cannot open " + path + "\n")
        return
    try:
        header = source.readline()
        match = re.match(r'(>[^>:\s]+)\s*$', header)
        if match is None:
            sys.stderr.write("invalid header: " + header + "\n")
            return
        header = match.group(1)
        datastart = source.tell()
        # first pass: count the bases so the length can go in the header
        datalength = 0
        while True:
            line = source.readline()
            if len(line) == 0:
                break
            match = re.match('([' + packstring + r']*)\s*$', line)
            if match is None:
                sys.stderr.write("invalid data: " + line + "\n")
                sys.stderr.write("output file will be incomplete\n")
                # stop counting here (this used to crash on match.group);
                # only the bases before the bad line are packed
                break
            datalength = datalength + len(match.group(1))
            if verbose:
                sys.stderr.write(str(datalength) + " bytes: " + \
                    match.group(1) + "\n")
        # start compressing the data
        outfile = path + ".2bit"
        try:
            sink = open(outfile, "wb")
        except IOError:
            sys.stderr.write("cannot create " + outfile + "\n")
            return
        try:
            sink.write(header + ":1-" + str(datalength) + "\nP")
            # that last 'P' after the newline is to ease coding the
            # unpacking, also so that `head -n 1 file.fa.2bit` can be used
            source.seek(datastart)
            _pack_fields(source, sink, datalength, packstring,
                lambda index: index % 4, verbose)
            # now write mask data to output
            # mask of 3 can be used for something else later(?)
            source.seek(datastart)
            _pack_fields(source, sink, datalength, packstring,
                lambda index: index // 4, verbose)
        finally:
            sink.close()
    finally:
        source.close()
        
def unpackgenome():
    """Unpack each .2bit file named on the command line into FILE.fa.

    A file that cannot be opened or does not start with the expected
    header is reported on stderr and skipped. See _unpack_file for the
    two passes that rebuild the text.
    """
    packstring = 'GATCgatcNnMR'
    verbose = False
    for path in sys.argv[1:]:
        _unpack_file(path, packstring, verbose)

def _unpack_file(path, packstring, verbose):
    """Unpack one file written by packgenome into path + '.fa'.

    Pass 1 expands the packed bases into lines of 50 uppercase letters.
    Pass 2 reads the packed masks that follow and rewrites each line in
    place, moving every letter 4 places along packstring per mask step
    ([GATC] has mask of 0; [gatc] 1; and [NnMR] 2).
    """
    try:
        # binary mode: everything after the header line is packed bytes
        packed = open(path, "rb")
    except IOError:
        sys.stderr.write("cannot open " + path + "\n")
        return
    try:
        header = packed.readline()
        match = re.match(r'(>[^>:\s]+):\d-(\d+)\s*$', header)
        if match is None:
            sys.stderr.write("invalid header: " + header + "\n")
            return
        header = match.group(1)
        datalength = int(match.group(2))
        skip = packed.read(1) # this is the 'P' we put while compressing
        if skip != 'P': # note that if it _is_ P it doesn't mean it's good
            sys.stderr.write("invalid data byte past header\n")
            return
        # start uncompressing the data
        outfile = path + ".fa" # makes it "file.fa.2bit.fa"
        try:
            output = open(outfile, "w")
        except IOError:
            sys.stderr.write("cannot create " + outfile + "\n")
            return
        output.write(header + "\n")
        datastart = output.tell()
        count = 0
        line = ''
        if verbose:
            sys.stderr.write("datalength = " + str(datalength) + "\n")
        while count < datalength:
            bases = packed.read(1)
            if len(bases) == 0:
                sys.stderr.write("...unexpected end of file\n")
                output.close()
                break
            chunk = ord(bases)
            mask = 0xc0 # binary 11000000
            if verbose:
                sys.stderr.write(".")
            for index in range(3, -1, -1):
                base = packstring[(chunk & mask) >> (index * 2)]
                if verbose:
                    sys.stderr.write(base)
                line = line + base
                mask = mask >> 2
                count = count + 1
                if count >= datalength:
                    if verbose:
                        sys.stderr.write("... count >= datalength\n")
                    break
                if len(line) == 50:
                    output.write(line + "\n")
                    line = ''
        if len(line) > 0:
            try:
                output.write(line + "\n") # write final data
            except (IOError, ValueError):
                pass # output was already closed after a short read
        output.close()
        # now correct output with mask
        try:
            output = open(outfile, "r+")
        except IOError:
            sys.stderr.write("cannot correct " + outfile + "\n")
            return
        output.seek(datastart)
        count = 0
        chunk = 0
        line = ''
        corrected = ''
        if verbose:
            sys.stderr.write("beginning correction at offset " + \
                str(output.tell()) + "\n")
            sys.stderr.write("datalength = " + str(datalength) + "\n")
        while count < datalength:
            bases = packed.read(1)
            if len(bases) == 0:
                sys.stderr.write("... failed correcting output file at " + \
                    str(count) + " out of " + str(datalength) + " bytes.\n")
                try:
                    output.close()
                except IOError:
                    pass
                break # just give up and move on to the next file
            chunk = ord(bases)
            mask = 0xc0 # binary 11000000
            for index in range(3, -1, -1):
                offset = count % 50
                if line == '':
                    # read the next uncorrected line, then step back so
                    # the corrected text overwrites it in place
                    linestart = output.tell()
                    line = output.readline()
                    if verbose:
                        sys.stderr.write("line = " + line + "\n")
                    output.seek(linestart)
                base = line[offset:offset + 1] # uncorrected base
                offset = packstring.find(base) # uncorrected offset
                offset = offset + (4 * ((chunk & mask) >> (2 * index)))
                base = packstring[offset:offset + 1] # corrected base
                corrected = corrected + base
                mask = mask >> 2
                count = count + 1
                if count >= datalength:
                    if verbose:
                        sys.stderr.write("... count >= datalength\n")
                    break
                if len(corrected) == 50:
                    try:
                        if verbose:
                            sys.stderr.write("writing " + corrected + \
                                " at offset " + str(linestart) + "\n")
                        output.write(corrected + "\n")
                        output.flush()
                        line = ''
                        corrected = ''
                    except (IOError, ValueError):
                        sys.stderr.write("... incomplete output\n")
                        break
        if len(corrected) > 0:
            if verbose:
                sys.stderr.write("writing final corrected line" + \
                    corrected + "\n")
            try:
                output.write(corrected + "\n") # write final data
            except (IOError, ValueError):
                pass # output was already closed above; nothing to add
        output.close()
    finally:
        packed.close()
        
def program_name(program_path):
    """Find the program name at the end of program_path.

    Returns a match object, or None when the path does not end in a run
    of lowercase letters and digits optionally followed by ".py".
    group(1) is the bare name, group(2) the ".py" suffix (None when
    absent) and group(0) both together.
    """
    # the dot is escaped so only a literal ".py" counts as the suffix
    return re.search(r'([a-z0-9]+)(\.py)?$', program_path)

def main():
    """Run the function named after the program (sys.argv[0]).

    The script does different jobs depending on the name it is invoked
    under, e.g. fa2mid, packgenome or unpackgenome. Exits with status 1
    when the name cannot be parsed or does not name a callable defined
    in this module.
    """
    match = program_name(sys.argv[0])
    if match is None:
        sys.stderr.write(sys.argv[0] + " is unrecognizable\n")
        sys.exit(1)
    program = match.group(1)
    # Look the name up instead of eval()ing text taken from the command
    # line; "main" itself is refused because it would recurse forever.
    entry = globals().get(program)
    if program == "main" or not callable(entry):
        sys.stderr.write(sys.argv[0] + " is unrecognizable\n")
        sys.exit(1)
    entry()

# The following is standard; it allows the script to be used as a library
# with 'import', but runs only when invoked directly
if __name__ == "__main__":
    main()