#!/usr/local/bin/python """ Author: Shop Mallick Contact: shop.mallick@gmail.com Date: Jun 1 2020 Usage: zcat mtDB.gz | python mtdna_uncompress_v2.py [options] Example: (a) Extract all mitogenomes: zcat mtDB.gz | python mtdna_uncompress_v2.py (b) Extract a specific single mitogenome, with id 'I0000' (c) Extract a set of mitogenomes with ids in the list 'I0000','I000a', 'I000b' (d) Extract a set of mitogenomes with ids in the file: 'myfile.ids' Notes: uses compression scheme developed by Nick Patterson in cTools from the SGDP project. Update: [2021_08_Feb]: data structure simplified, empty sequencess stripped """ import sys, os from optparse import OptionParser usage = "usage: zcat mtDB.gz | python %prog [options] " parser = OptionParser(usage=usage, version="%prog v2") parser.add_option( "-i", "--id", action="store", type="string", dest="idset", help="extract mitogenomes for a set of ids",default=False) parser.add_option( "-f", "--file", action="store", type="string", dest="idfile", help="extract a list of mitogenomes from file (will only list the ones in idset, if specified)",default=False) parser.add_option( "-c", "--check", action="store_true", dest="check", help="extract a list of mitogenomes from file (will only list the ones in idset, if specified)",default=False) (options, args) = parser.parse_args() if options.idset: nids= len( options.idset.split(",")) sys.stderr.write( "# looking for %i ID(s): idset=%s\n" % (nids, options.idset )) rsrs = """ GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCCCATC CCATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTAAAT GTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCA AACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCA AACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTT TATCTTTTGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACATT ATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCCGC CCATCCTACCCAGCACACACACNNCGCTGCTAACCCCATACCCCGAACCA ACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAA GCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAAT AGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAA GCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGG ACAAGCATCAAGCACGCAACAATGCAGCTCAAAACGCTTAGCCTAGCCAC ACCCCCACGGGAAACAGCAGTGATAAACCTTTAGCAATAAACGAAAGTTT AACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGC GGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTA GATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACT CCAGTTGACACAAAATAAACTACGAAAGTGGCTTTAACATATCTGAACAC ACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCC TAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGC CACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGG AGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTC AGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAA GCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTG GCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTAT GAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTGAGAGTAGAGTGC TTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCCTC AAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATAGA GGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAA CCAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCA ACTTAACTTGACCGCTCTGAGCTAAACCTAGCCCCAAACCCACTCCACCT TACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCG ATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATG AAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTTCT GCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGA CCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGT CTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTAC CGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTTTA AATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTC CAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGT AAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAA GCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGA ACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATG TTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGA TTAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCAAC AAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGA AAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTA CCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCCCA GTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATA ATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTC AGCTGTCTCTTACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCG GGCATGACACAGCAAGACGAGAAGACCCTATGGAGCTTTAATTTATTAAT GCAAACAATACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATT AAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGT ACATGCTAAGACTTCACCAGTCAAAGCGAACTACCATACTCAATTGATCC AATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCT ATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCA GGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATT AAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCT ATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCT ACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATT ATACCCACACCCACCCAAGAACAGGGTTTGTTAAGATGGCAGAGCCCGGT AATCGCATAAAACTTAAAACTTTACAGTCAGAGGTTCAATTCCTCTTCTT AACAACATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAAT CGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATAC AACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACCC TTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCAC ATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCG CTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTTAACCTC AACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACTC AATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCG CACTGCGAGCAGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATC ATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCCT TATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTGG CCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTC GACCTTGCCGAAGGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAATA CGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATTA TTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGAC GCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTACT TCTGACCTCCCTGTTCTTATGAATTCGAACAGCATACCCCCGATTCCGCT ACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTA GCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCC CCCTCAAACCTAAGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGTA AATAATAGGAGTTTAAACCCCCTTATTTCTAGGACTATGAGAATCGAACC CATCCCTGAGAATCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTA AAGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGG TTATACCCTTCCCGTACTAATTAATCCCCTGGCCCAACCCGTCATCTACT CTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTT TTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCT AACCAAAAAAATAAACCCTCGTTCCACAGAAGCTGCCATCAAGTATTTCC TCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTATCCTCTTCAAC AATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACTC ATCATTAATAATCATAATGGCTATAGCAATAAAACTAGGAATAGCCCCCT TTCACTTCTGAGTCCCAGAGGTTACCCAAGGCACCCCTCTGACATCCGGC CTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCA AATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTAT CCATCATAGCAGGCAGTTGAGGTGGATTAAACCAAACCCAGCTACGCAAA ATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAATAGCAGTTCT ACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCC TAACTACTACCGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACC CTACTACTATCTCGCACCTGAAACAAGCTAACATGACTAACACCCTTAAT TCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTTT TGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATC ATCCCCACCATCATAGCCACCATCACCCTCCTTAACCTCTACTTCTACCT ACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATATCTAACAACG TAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCC ACACTCATCGCCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACT AATAATCTTATAGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAGC CCTCAGTAAGTTGCAATACTTAATTTCTGTAACAGCTAAGGACTGCAAAA CCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAG CCCTTACTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGCT AAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCCGCCGGGAAAA AAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAA TTCAATATGAAAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTGT CTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACCTCACCCCCA CTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGG AACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCTC TAAGCCTCCTTATTCGAGCCGAGCTGGGCCAGCCAGGCAACCTTCTAGGT AACGACCACATCTACAACGTTATCGTCACAGCCCATGCATTTGTAATAAT CTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTAG TTCCCCTAATAATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAAC ATAAGCTTCTGACTCTTACCTCCCTCTCTCCTACTCCTGCTCGCATCTGC TATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAG CAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCC TTACACCTAGCAGGTGTCTCCTCTATCTTAGGGGCCATCAATTTCATCAC AACAATTATCAATATAAAACCCCCTGCCATAACCCAATACCAAACGCCCC TCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTC CCAGTCCTAGCTGCTGGCATCACTATACTACTAACAGACCGCAACCTCAA CACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCTATACCAAC ACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCA GGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGA ACCATTTGGATACATAGGTATGGTCTGAGCTATGATATCAATTGGCTTCC TAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGGAATAGACGTA GACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCAC CGGCGTCAAAGTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATGA AATGATCTGCTGCAGTGCTCTGAGCCCTAGGATTCATCTTTCTTTTCACC GTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGT ACTACACGACACGTACTACGTTGTAGCTCACTTCCACTATGTCCTATCAA TAGGAGCTGTATTTGCCATCATAGGAGGCTTCATTCACTGATTTCCCCTA TTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCCATTTCGCTAT CATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGCC TATCCGGAATGCCCCGACGTTACTCGGACTACCCCGATGCATACACCACA TGAAATATCCTATCATCTGTAGGCTCATTCATTTCTCTAACAGCAGTAAT ATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCC TAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCCC CCACCCTACCACACATTCGAAGAACCCGTATACATAAAATCTAGACAAAA AAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCCAACCCCATGGCC TCCATGACTTTTTCAAAAAGATATTAGAAAAACCATTTCATAACTTTGTC AAAGTTAAATTATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCG CAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCAC CTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCC TGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATC TCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCAT CCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACG AGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTAC TGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACAT ACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTG ACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACA TCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAAC AGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGAC CGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGT TTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGG GCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGT AAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAGAACCAACA CCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCAT AATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATAT TAAACACAAACTACCACTTACCTCCCTCACCAAAGCCCATAAAAATAAAA AATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCA TTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCT ATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATCAACAACCGAC TAATTACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGATA GCCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTT AATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCAT TTACACCAACCACCCAACTATCTATAAACCTAGCCATGGCCATCCCCTTA TGAGCGGGCGCAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATGCCCT AGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAG TTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTA CGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGG AAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCA TCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTA ATCCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACAA CACATAATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCCC ATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAG CCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTA ACCAACACACTAACCATATACCAATGATGGCGCGATGTAACACGAGAAAG CACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACG GGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTC TGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAACTAGGAGGGCA CTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTCC TAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCAC CATAGTCTAATAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTAT TACAATTTTACTGGGTCTCTATTTTACCCTCCTACAAGCCTCAGAGTACT TCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTT GTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCT CACTATCTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACATC ACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGTAGATGTGGTT TGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTAG TATAAATAGTACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAAA AAAGAGTAATAAACTTCGCCTTAATTTTAATAATCAACACCCTCCTAGCC TTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACAT AGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCCC GCGTCCCTTTCTCCATAAAATTCTTCTTAGTAGCTATTACCTTCTTATTA TTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAGCCCTACAAAC AACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCC TAGCCCTAAGTCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAGCC GAATTGGTATATAGTTTAAACAAAACGAATGATTTCGACTCATTAAATTA TGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTAG CATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATA TCCTCCCTACTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAGC TACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAATATTGTGCCTA TTGCCATACTAGTTTTTGCCGCCTGCGAAGCAGCGGTAGGCCTAGCCCTA CTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAAA CCTACTCCAATGCTAAAACTAATCGTCCCAACAATTATATTACTACCACT GACATGACTCTCCAAAAAACACATAATTTGAATCAACACAACCACCCACA GCCTAATTATTAGCATCATCCCCCTACTATTTTTTAACCAAATCAACAAC AACCTATTTAGCTGCTCCCCAACCTTTTCCTCCGACCCCCTAACAACCCC CCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATCATGGCAAGCC AACGCCACTTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCT ATACTAATCTCCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGA ACTAATCATATTTTATATCTTCTTCGAAACCACACTTATCCCCACCTTGG CTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACA TACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACT AATTTACACTCACAACACCCTAGGCTCACTAAACATTCTACTACTCACTC TCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTTAATATGACTA GCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTT ATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTAC TTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATACGCCTCACA CTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTACT ATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACAG ACCTAAAATCGCTCATTGCATACTCTTCAATCAGCCACATAGCCCTCGTA GTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCAT TCTCATAATCGCCCACGGACTTACATCCTCATTACTATTCTGCCTAGCAA ACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGA CTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCCT CGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTG TGCTAGTAACCACATTCTCCTGATCAAATATCACTCTCCTACTTACAGGA CTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAAC ACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTCA CACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTA TCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTT AACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTTACGACCCCTTA TTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAAC AACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAG GCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACAC TACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCA CCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCC ATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCAT GTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCC AAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATA TTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACT GTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTAC TCATTTTCCTAATTACCATACTAATCTTAGTTACCGCTAACAACCTATTC CAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCAT CAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAA TCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATGA TTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAA CGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAG GCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAA GGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGC AGGAGTCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCAC TAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCA GCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTC CACTTCAAGTCAACTAGGACTCATAGTAGTTACAATCGGCATCAACCAAC CACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATA CTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGA TATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCT CCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGT TTCTATTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAAA CGCCTGAGCCCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCCT ATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCT ACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAA ACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAACA TTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACTC ACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAA CTACCTAACCAACAAACTTAAAATAAAATCCCCACTATGCACATTTTATT TCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCCC TATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCT AACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCT CCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTTC CTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAACC TATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGTT CAACCAGTAACTACTACTAATCAACGCCCATAATCATACAAAGCCCCCGC ACCAATAGGATCCTCCCGAATCAACCCTGACCCCTCTCCTTCATAAATTA TTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATAC TCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAAC ACTCACCAAGACCTCAACCCCTGACCCCCATGCCTCAGGATACTCCTCAA TAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCCCCCTAAATAA ATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAAT AACACACCCGACCACACCGCTAACAATCAATACTAAACCCCCATAAATAG GAGAAGGCTTAGAAGAAAACCCCACAAACCCCATTACTAAACCCACACTC AACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGAC CAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATGA CCCCAATACGCAAAATTAACCCCCTAATAAAATTAATTAACCACTCATTC ATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTC ACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAG CCATGCACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCAC ATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCAA TGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCC TATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATC CTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTG AGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAACTTACTATCCG CCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTAC TCAGTAGACAGTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTT GCCCTTCATTATTGCAGCCCTAGCAGCACTCCACCTCCTATTCTTGCACG AAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATC ACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCTT CCTTCTCTCCTTAATGACATTAACACTATTCTCACCAGACCTCCTAGGCG ACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCCTCCCCACATC AAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCC TAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAG CAATAATCCCCATCCTCCATATATCCAAACAACAAAGCATAATATTTCGC CCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTCT AACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGAC AAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAACT ATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTCCTTGTAGTAT AAACTAATACACCAGTCTTGTAAACCGGAGATGAAAACCTTTTTCCAAGG ACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAG ATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGT ACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACA TTACTGCCAGCCACCATGAATATTGTACAGTACCATAAATACTTGACCAC CTGTAGTACATAAAAACCCAATCCACATCAAAACCCTCCCCCCATGCTTA CAAGCAAGTACAGCAATCAACCTTCAACTGTCACACATCAACTGCAACTC CAAAGCCACCCCTCACCCACTAGGATATCAACAAACCTACCCACCCTTAA CAGTACATAGCACATAAAGCCATTTACCGTACATAGCACATTACAGTCAA ATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGAC CACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCG CTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACAT CTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCT TAAATAAGACATCACGATG""".replace("\n","") seq_in=[] get_ids={} if options.idset: [ get_ids.setdefault(x,1) for x in options.idset.split(',') ] if options.idfile: print "# reading IDs from file" if not os.path.exists( options.idfile ): print "Yikes: no id file: .%s. - quitting" exit fh = open( options.idfile ) for l in fh.xreadlines(): l = l.strip() get_ids[l]=1 n_get_ids=len(get_ids) hit={} ndumped=0 for l in sys.stdin: l = l.strip() v = l.split("\t") idd = v[0].replace(">","") flag_dump=0 if True: # dump? if get_ids.has_key( idd ) or n_get_ids==0: if options.check: print "found: %s" % idd continue seq_in = v[1] seq_in = "%s%s" % ( seq_in, "-" * ( len(rsrs)-len(seq_in) )) x = [ x[0].replace( 'Q', x[1] ) for x in zip( seq_in, rsrs ) ] x = "".join(x) print ">%s" % idd print "\n".join( [ x[i:i+60] for i in range( 0,len(x),60 )] ) ndumped+=1 hit[ idd ] = 1 continue # report nhit=len(hit) sys.stderr.write( "# Number of ( ids_requested, scanned, outputted ) = (%i, %i, %i)\n" % ( n_get_ids, nhit, ndumped )) if n_get_ids>0 and nhit