-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgetMcSequence.py
158 lines (145 loc) · 6.02 KB
/
getMcSequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
# encoding: utf-8
"""
sub_seq_getter.py
Created by Brant Faircloth on 2009-12-17.
Copyright (c) 2009 Brant Faircloth. All rights reserved.
"""
import os, sys, pdb, time, MySQLdb, cPickle, optparse, ConfigParser
from Bio.SeqIO import QualityIO
def interface():
usage = "usage: %prog [options]"
p = optparse.OptionParser(usage)
p.add_option('--configuration', '-c', dest = 'conf', action='store', \
type='string', default = None, help='The path to the configuration file.', \
metavar='FILE')
p.add_option('--species', '-s', dest = 'species', action='store', \
type='string', default = None, help='The species for which to grab sequence')
p.add_option('--trimmed', '-t', dest = 'trimmed', action='store_true', \
default = False, help='Get trimmed sequence')
p.add_option('--msat', '-m', dest = 'msat', action='store_true', \
default = False, help='Get microsat containing sequence')
# p.add_option('--all', '-a', dest = 'all', action='store_true',\
#default = False, help='Get all sequences')
p.add_option('--byspecies', '-b', dest = 'byspecies', action='store_true', \
default = False, help='Get sequences by species')
p.add_option('--gigantic','-g', dest='gigantic', action='store_true', \
default = False, help='Store all sequence in one big file')
p.add_option('--fasta', '-f', dest='fasta', action='store', \
type='string', default = False, help='Elements you want in the fasta header\
for each read - separate with a %sign. Elements include cluster, \
mid, name, id')
(options,arg) = p.parse_args()
if not options.conf:
p.print_help()
sys.exit(2)
if not os.path.isfile(options.conf):
print "You must provide a valid path to the configuration file."
p.print_help()
sys.exit(2)
return options, arg
def not_trimmed(cur, conf, options, sequence, qual):
cur.execute('SELECT name FROM sequence WHERE cluster = %s', (options.species))
data = cur.fetchall()
dataset = set()
for d in data:
dataset.add(d[0])
seqs = QualityIO.PairedFastaQualIterator(
open(conf.get('Input','sequence'), "rU"),
open(conf.get('Input','qual'), "rU"))
try:
while seqs:
record = seqs.next()
if record.name in dataset:
sequence.write('%s' % record.format('fasta'))
qual.write('%s' % record.format('qual'))
except StopIteration:
pass
qual.close()
sequence.close()
def trimmed(cur, conf, species, msat, sequence, qual):
#pdb.set_trace()
if msat == 'True':
cur.execute('''SELECT record FROM sequence where cluster = %s and msat = 1''', (species))
elif msat == 'False':
cur.execute('''SELECT record FROM sequence where cluster = %s''', (species))
for p_rec in cur.fetchall():
record = cPickle.loads(p_rec[0])
if len(record) > 0:
sequence.write('%s' % record.format('fasta'))
qual.write('%s' % record.format('qual'))
qual.close()
sequence.close()
def main():
start_time = time.time()
options, arg = interface()
print 'Started: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(start_time))
conf = ConfigParser.ConfigParser()
conf.read(options.conf)
conn = MySQLdb.connect(
user=conf.get('Database','USER'),
passwd=conf.get('Database','PASSWORD'),
db=conf.get('Database','DATABASE')
)
cur = conn.cursor()
# get all sequences where cluster != null
# put them in monolithic file
# change the sequence header
if options.gigantic:
outfa = open('monolithic_PARSED.fsa','w')
outqual = open('monolithic_PARSED.qual','w')
if options.trimmed:
#pdb.set_trace()
if options.fasta:
fs = options.fasta.split('%')
fs = ', '.join(fs)
query = '''SELECT %s, record FROM sequence WHERE cluster IS NOT NULL and trimmed_len > 0''' % fs
results = cur.execute(query)
else:
results = cur.execute('''SELECT record FROM sequence WHERE cluster IS NOT NULL and trimmed_len > 0''')
while 1:
result = cur.fetchone()
if not result:
break
record = cPickle.loads(result[-1])
var = len(result) - 1
if options.fasta:
record.id = '_'.join([str(i) for i in result[0:var]])
record.name = ''
record.description = ''
#pdb.set_trace()
outfa.write('%s' % record.format('fasta'))
outqual.write('%s' % record.format('qual'))
outfa.close()
outqual.close()
elif options.all == 'True' and options.trimmed == 'True':
# get all the cluster names from the dbase
cur.execute('SELECT distinct(cluster) from sequence')
clusters = cur.fetchall()
for c in clusters:
if c[0] != None:
#pdb.set_trace()
out = c[0] + '_PARSED.fsa'
q = c[0] + '_PARSED.qual'
sequence = open(out,'w')
qual = open(q,'w')
trimmed(cur, conf, c[0], options.msat, sequence, qual)
elif options.trimmed == 'False':
out = options.species + '_PARSED.fsa'
q = options.species + '_PARSED.qual'
sequence = open(out,'w')
qual = open(q,'w')
not_trimmed(cur, conf, options.species, options.msat, sequence, qual)
elif options.trimmed == 'True':
out = options.species + '_PARSED.fsa'
q = options.species + '_PARSED.qual'
sequence = open(out,'w')
qual = open(q,'w')
trimmed(cur, conf, options, sequence, qual)
cur.close()
conn.close()
end_time = time.time()
print 'Ended: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(end_time))
print '\nTime for execution: ', (end_time - start_time)/60, 'minutes'
if __name__ == '__main__':
main()