-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path10_store_tads.py
executable file
·79 lines (62 loc) · 2.45 KB
/
10_store_tads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/python3
#
# This file is part of Progesterone pipeline.
#
# Progesterone pipeline is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Progesterone pipeline is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Progesterone pipeline. If not, see <https://www.gnu.org/licenses/>.
#
# The single input file comes from endometrial microvascular endothelial cells
# (Job Dekker lab), https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE105710
# The BED file with the TADs can be found at
# https://www.encodeproject.org/experiments/ENCSR551IPY/
# (under "processed data").
from utils.utils import *
from utils.mysqldb import *
#########################################
def _read_tads(tadfile):
    """Read TAD intervals from a BED file, grouped by chromosome.

    Only the first three whitespace-separated columns (chromosome, start,
    end) of each record are used. Blank lines, '#' comment lines and
    'track' / 'browser' header lines are skipped.

    Returns a dict mapping the chromosome name to a list of [start, end]
    integer pairs, in the order they appear in the file.

    Raises ValueError if a record has fewer than three columns or if its
    start/end columns are not integers.
    """
    tads = {}
    # the context manager closes the file even if a record fails to parse
    with open(tadfile, "r") as inf:
        for line in inf:
            columns = line.split()
            if not columns:
                continue  # blank line
            if columns[0].startswith("#") or columns[0] in ("track", "browser"):
                continue  # BED header / comment line, not a region
            chrom, start, end = columns[:3]
            tads.setdefault(chrom, []).append([int(start), int(end)])
    return tads


def main():
    """Store the TADs from one BED file as 'tad' regions in the 'progesterone' database.

    The input paths, species, assembly and cross-reference ids are hard-coded
    below. If the BED file or the MySQL configuration file is missing, the
    name of the first missing file is printed and the script exits with
    status 1.
    """
    # imported locally so that this function does not depend on what the
    # star imports at the top of the file happen to re-export
    import os
    import sys

    species = "human"
    assembly = "hg19"  # this came from the file metadata, not the file itself
    tadfile = "/storage/databases/encode/ENCSR551IPY/ENCFF633ORE.bed"
    conf_file = "/home/ivana/.mysql_conf"

    ########
    # references
    # I don't see a way to automate this - for example we might have had the pubmed ref, but we don't
    # NOTE(review): these look like ENCODE accessions but are stored under
    # the 'geo' xref type below - confirm that this is intended
    geo_exp_id = 'ENCSR551IPY'
    geo_file_id = 'ENCFF633ORE'

    for prerequisite in [tadfile, conf_file]:
        if os.path.exists(prerequisite):
            continue
        print(prerequisite, "not found")
        sys.exit(1)  # non-zero status, so that the caller can detect the failure

    tads = _read_tads(tadfile)

    db = connect_to_mysql(conf_file)
    try:
        cursor = db.cursor()
        try:
            search_db(cursor, "set autocommit=1")
            switch_to_db(cursor, 'progesterone')

            # the file xref is stored with the experiment xref as its parent
            xref_exp_id = store_xref(cursor, 'geo', geo_exp_id)
            xref_file_id = store_xref(cursor, 'geo', geo_file_id, parent_id=xref_exp_id)

            for chrom, regions in tads.items():
                for start, end in regions:
                    # store region
                    fields = {'species': species, 'chromosome': chrom, 'assembly': assembly, 'rtype': 'tad',
                              'rfrom': start, 'rto': end, 'xref_id': xref_file_id}
                    store_without_checking(cursor, 'regions', fields)
        finally:
            # release the cursor even if one of the store calls raised
            cursor.close()
    finally:
        db.close()
#########################################
# run the pipeline step only when this file is executed as a script
if "__main__" == __name__:
    main()