diff --git a/examples/gene_workflow.ipynb b/examples/gene_workflow.ipynb index cbedb6ff..f7cba1ea 100644 --- a/examples/gene_workflow.ipynb +++ b/examples/gene_workflow.ipynb @@ -262,97 +262,24 @@ "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
identifieridentifier.sourcetargettarget.sourceDisGeNET
0AGRNHGNC375790NCBI Gene[{'gene_dsi': 0.626, 'gene_dpi': 0.538, 'gene_...
1ALG14HGNC199857NCBI Gene[{'gene_dsi': 0.722, 'gene_dpi': 0.308, 'gene_...
2ALG2HGNC85365NCBI Gene[{'gene_dsi': 0.67, 'gene_dpi': 0.423, 'gene_p...
3CHATHGNC1103NCBI Gene[{'gene_dsi': 0.52, 'gene_dpi': 0.808, 'gene_p...
4CHD8HGNC57680NCBI Gene[{'gene_dsi': 0.656, 'gene_dpi': 0.577, 'gene_...
\n", - "
" - ], - "text/plain": [ - " identifier identifier.source target target.source \\\n", - "0 AGRN HGNC 375790 NCBI Gene \n", - "1 ALG14 HGNC 199857 NCBI Gene \n", - "2 ALG2 HGNC 85365 NCBI Gene \n", - "3 CHAT HGNC 1103 NCBI Gene \n", - "4 CHD8 HGNC 57680 NCBI Gene \n", - "\n", - " DisGeNET \n", - "0 [{'gene_dsi': 0.626, 'gene_dpi': 0.538, 'gene_... \n", - "1 [{'gene_dsi': 0.722, 'gene_dpi': 0.308, 'gene_... \n", - "2 [{'gene_dsi': 0.67, 'gene_dpi': 0.423, 'gene_p... \n", - "3 [{'gene_dsi': 0.52, 'gene_dpi': 0.808, 'gene_p... \n", - "4 [{'gene_dsi': 0.656, 'gene_dpi': 0.577, 'gene_... " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" + "ename": "JSONDecodeError", + "evalue": "Expecting value: line 1 column 1 (char 0)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "File \u001b[1;32m~\\anaconda3\\envs\\pyBiodatafuse_dev\\Lib\\site-packages\\requests\\models.py:971\u001b[0m, in \u001b[0;36mResponse.json\u001b[1;34m(self, **kwargs)\u001b[0m\n\u001b[0;32m 970\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 971\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m complexjson\u001b[38;5;241m.\u001b[39mloads(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtext, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 972\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m JSONDecodeError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 973\u001b[0m \u001b[38;5;66;03m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[39;00m\n\u001b[0;32m 974\u001b[0m \u001b[38;5;66;03m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[39;00m\n", + "File \u001b[1;32m~\\anaconda3\\envs\\pyBiodatafuse_dev\\Lib\\json\\__init__.py:346\u001b[0m, in \u001b[0;36mloads\u001b[1;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[0;32m 343\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[0;32m 344\u001b[0m parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[0;32m 345\u001b[0m parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[1;32m--> 346\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _default_decoder\u001b[38;5;241m.\u001b[39mdecode(s)\n\u001b[0;32m 347\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[1;32m~\\anaconda3\\envs\\pyBiodatafuse_dev\\Lib\\json\\decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[1;34m(self, s, _w)\u001b[0m\n\u001b[0;32m 333\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[0;32m 334\u001b[0m \u001b[38;5;124;03mcontaining a JSON document).\u001b[39;00m\n\u001b[0;32m 335\u001b[0m \n\u001b[0;32m 336\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m--> 337\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw_decode(s, idx\u001b[38;5;241m=\u001b[39m_w(s, \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mend())\n\u001b[0;32m 338\u001b[0m end \u001b[38;5;241m=\u001b[39m _w(s, end)\u001b[38;5;241m.\u001b[39mend()\n", + "File \u001b[1;32m~\\anaconda3\\envs\\pyBiodatafuse_dev\\Lib\\json\\decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[1;34m(self, s, idx)\u001b[0m\n\u001b[0;32m 354\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m--> 355\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpecting value\u001b[39m\u001b[38;5;124m\"\u001b[39m, s, err\u001b[38;5;241m.\u001b[39mvalue) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 356\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj, end\n", + "\u001b[1;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[5], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m api_key \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m0209751bfa7b6a981a8f5fb5f062313067ecd36c\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# TODO: add your key\u001b[39;00m\n\u001b[0;32m 2\u001b[0m params \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msource\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCURATED\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mformat\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson\u001b[39m\u001b[38;5;124m\"\u001b[39m} \u001b[38;5;66;03m# only curated data\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m disgenet_result, disgenet_metadata \u001b[38;5;241m=\u001b[39m disgenet\u001b[38;5;241m.\u001b[39mget_gene_disease(\n\u001b[0;32m 4\u001b[0m bridgedb_df\u001b[38;5;241m=\u001b[39mbridgdb_df, api_key\u001b[38;5;241m=\u001b[39mapi_key, params\u001b[38;5;241m=\u001b[39mparams\n\u001b[0;32m 5\u001b[0m )\n\u001b[0;32m 6\u001b[0m disgenet_result\u001b[38;5;241m.\u001b[39mhead()\n", + "File \u001b[1;32m~\\Desktop\\pyBiodatafuse\\src\\pyBiodatafuse\\annotators\\disgenet.py:77\u001b[0m, in \u001b[0;36mget_gene_disease\u001b[1;34m(bridgedb_df, api_key, params)\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[38;5;66;03m# Get all the diseases associated with genes for the current chunk\u001b[39;00m\n\u001b[0;32m 76\u001b[0m gda_response \u001b[38;5;241m=\u001b[39m s\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mapi_host\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/gda/gene/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchunked_input\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, params\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m---> 77\u001b[0m chunk_output \u001b[38;5;241m=\u001b[39m gda_response\u001b[38;5;241m.\u001b[39mjson()\n\u001b[0;32m 78\u001b[0m disgenet_output\u001b[38;5;241m.\u001b[39mextend(chunk_output)\n\u001b[0;32m 80\u001b[0m \u001b[38;5;66;03m# Record the end time\u001b[39;00m\n", + "File \u001b[1;32m~\\anaconda3\\envs\\pyBiodatafuse_dev\\Lib\\site-packages\\requests\\models.py:975\u001b[0m, in \u001b[0;36mResponse.json\u001b[1;34m(self, **kwargs)\u001b[0m\n\u001b[0;32m 971\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m complexjson\u001b[38;5;241m.\u001b[39mloads(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtext, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 972\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m JSONDecodeError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 973\u001b[0m \u001b[38;5;66;03m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[39;00m\n\u001b[0;32m 974\u001b[0m \u001b[38;5;66;03m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[39;00m\n\u001b[1;32m--> 975\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m RequestsJSONDecodeError(e\u001b[38;5;241m.\u001b[39mmsg, e\u001b[38;5;241m.\u001b[39mdoc, e\u001b[38;5;241m.\u001b[39mpos)\n", + "\u001b[1;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)" + ] } ], "source": [ @@ -370,166 +297,15 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[{'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C3808739',\n", - " 'disease_name': 'MYASTHENIC SYNDROME, CONGENITAL, 8',\n", - " 'disease_class': None,\n", - " 'disease_class_name': None,\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Disease or Syndrome',\n", - " 'score': 0.8,\n", - " 'ei': 1.0,\n", - " 'el': None,\n", - " 'year_initial': 2009.0,\n", - " 'year_final': 2014.0,\n", - " 'source': 'CURATED'},\n", - " {'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C0751882',\n", - " 'disease_name': 'Myasthenic Syndromes, Congenital',\n", - " 'disease_class': 'C16;C10',\n", - " 'disease_class_name': ' Congenital, Hereditary, and Neonatal Diseases and Abnormalities; Nervous System Diseases',\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Disease or Syndrome',\n", - " 'score': 0.65,\n", - " 'ei': 1.0,\n", - " 'el': 'strong',\n", - " 'year_initial': 2009.0,\n", - " 'year_final': 2020.0,\n", - " 'source': 'CURATED'},\n", - " {'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C0751883',\n", - " 'disease_name': 'Congenital Myasthenic Syndromes, Postsynaptic',\n", - " 'disease_class': 'C16;C10',\n", - " 'disease_class_name': ' Congenital, Hereditary, and Neonatal Diseases and Abnormalities; Nervous System Diseases',\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Disease or Syndrome',\n", - " 'score': 0.5,\n", - " 'ei': 1.0,\n", - " 'el': None,\n", - " 'year_initial': 2009.0,\n", - " 'year_final': 2012.0,\n", - " 'source': 'CURATED'},\n", - " {'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C0751884',\n", - " 'disease_name': 'Congenital Myasthenic Syndromes, Presynaptic',\n", - " 'disease_class': 'C16;C10',\n", - " 'disease_class_name': ' Congenital, Hereditary, and Neonatal Diseases and Abnormalities; Nervous System Diseases',\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Disease or Syndrome',\n", - " 'score': 0.5,\n", - " 'ei': 1.0,\n", - " 'el': None,\n", - " 'year_initial': 2009.0,\n", - " 'year_final': 2012.0,\n", - " 'source': 'CURATED'},\n", - " {'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C0023467',\n", - " 'disease_name': 'Leukemia, Myelocytic, Acute',\n", - " 'disease_class': 'C04',\n", - " 'disease_class_name': ' Neoplasms',\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Neoplastic Process',\n", - " 'score': 0.3,\n", - " 'ei': 1.0,\n", - " 'el': None,\n", - " 'year_initial': 2007.0,\n", - " 'year_final': 2007.0,\n", - " 'source': 'CURATED'},\n", - " {'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C0026998',\n", - " 'disease_name': 'Acute Myeloid Leukemia, M1',\n", - " 'disease_class': 'C04',\n", - " 'disease_class_name': ' Neoplasms',\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Neoplastic Process',\n", - " 'score': 0.3,\n", - " 'ei': 1.0,\n", - " 'el': None,\n", - " 'year_initial': 2007.0,\n", - " 'year_final': 2007.0,\n", - " 'source': 'CURATED'},\n", - " {'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C0751885',\n", - " 'disease_name': 'Myasthenic Syndromes, Congenital, Slow Channel',\n", - " 'disease_class': 'C16;C10',\n", - " 'disease_class_name': ' Congenital, Hereditary, and Neonatal Diseases and Abnormalities; Nervous System Diseases',\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Disease or Syndrome',\n", - " 'score': 0.3,\n", - " 'ei': nan,\n", - " 'el': None,\n", - " 'year_initial': nan,\n", - " 'year_final': nan,\n", - " 'source': 'CURATED'},\n", - " {'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C1850792',\n", - " 'disease_name': 'Congenital myasthenic syndrome ib',\n", - " 'disease_class': 'C16;C10',\n", - " 'disease_class_name': ' Congenital, Hereditary, and Neonatal Diseases and Abnormalities; Nervous System Diseases',\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Disease or Syndrome',\n", - " 'score': 0.3,\n", - " 'ei': nan,\n", - " 'el': 'limited',\n", - " 'year_initial': nan,\n", - " 'year_final': nan,\n", - " 'source': 'CURATED'},\n", - " {'gene_dsi': 0.626,\n", - " 'gene_dpi': 0.538,\n", - " 'gene_pli': 5.4727e-07,\n", - " 'protein_class': None,\n", - " 'protein_class_name': None,\n", - " 'diseaseid': 'C1879321',\n", - " 'disease_name': 'Acute Myeloid Leukemia (AML-M2)',\n", - " 'disease_class': 'C04',\n", - " 'disease_class_name': ' Neoplasms',\n", - " 'disease_type': 'disease',\n", - " 'disease_semantic_type': 'Neoplastic Process',\n", - " 'score': 0.3,\n", - " 'ei': 1.0,\n", - " 'el': None,\n", - " 'year_initial': 2007.0,\n", - " 'year_final': 2007.0,\n", - " 'source': 'CURATED'}]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'disgenet_result' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m disgenet_result[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDisGeNET\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;241m0\u001b[39m]\n", + "\u001b[1;31mNameError\u001b[0m: name 'disgenet_result' is not defined" + ] } ], "source": [ @@ -589,33 +365,33 @@ " \n", " \n", " 1\n", - " AGRN\n", + " ALG14\n", " HGNC\n", - " A0A494C0G5\n", + " Q96F25\n", " Uniprot-TrEMBL\n", " [{'label': nan, 'InChIKey': nan, 'SMILES': nan...\n", " \n", " \n", " 2\n", - " AGRN\n", + " ALG2\n", " HGNC\n", - " A0A494C1I6\n", + " A0A024R184\n", " Uniprot-TrEMBL\n", " [{'label': nan, 'InChIKey': nan, 'SMILES': nan...\n", " \n", " \n", " 3\n", - " AGRN\n", + " CHAT\n", " HGNC\n", - " O00468\n", + " A0A1W2PP46\n", " Uniprot-TrEMBL\n", " [{'label': nan, 'InChIKey': nan, 'SMILES': nan...\n", " \n", " \n", " 4\n", - " ALG14\n", + " CHD8\n", " HGNC\n", - " Q96F25\n", + " A0A2R8Y4P3\n", " Uniprot-TrEMBL\n", " [{'label': nan, 'InChIKey': nan, 'SMILES': nan...\n", " \n", @@ -626,10 +402,10 @@ "text/plain": [ " identifier identifier.source target target.source \\\n", "0 AGRN HGNC A0A087X208 Uniprot-TrEMBL \n", - "1 AGRN HGNC A0A494C0G5 Uniprot-TrEMBL \n", - "2 AGRN HGNC A0A494C1I6 Uniprot-TrEMBL \n", - "3 AGRN HGNC O00468 Uniprot-TrEMBL \n", - "4 ALG14 HGNC Q96F25 Uniprot-TrEMBL \n", + "1 ALG14 HGNC Q96F25 Uniprot-TrEMBL \n", + "2 ALG2 HGNC A0A024R184 Uniprot-TrEMBL \n", + "3 CHAT HGNC A0A1W2PP46 Uniprot-TrEMBL \n", + "4 CHD8 HGNC A0A2R8Y4P3 Uniprot-TrEMBL \n", "\n", " transporter_inhibitor \n", "0 [{'label': nan, 'InChIKey': nan, 'SMILES': nan... \n", @@ -660,19 +436,19 @@ "[{'label': 'Cefepime',\n", " 'InChIKey': 'HVFLCNVBZFFHBT-UHFFFAOYSA-O',\n", " 'SMILES': 'CON=C(C(=O)NC1C(=O)N2C(C(=O)O)=C(C[N+]3(C)CCCC3)CSC12)c1csc(N)n1',\n", - " 'pubchem_compound_id': '2623',\n", + " 'pubchem_compound_id': 2623,\n", " 'molmedb_id': 'MM16967',\n", " 'source_doi': 'doi:10.1074/jbc.275.3.1699',\n", - " 'source_pmid': '10636865',\n", + " 'source_pmid': 10636865,\n", " 'chebi_id': nan,\n", " 'drugbank_id': nan},\n", " {'label': 'Cephaloridine',\n", " 'InChIKey': 'CZTQZXZIADLWOZ-UHFFFAOYSA-O',\n", " 'SMILES': 'O=C(Cc1cccs1)NC1C(=O)N2C(C(=O)O)=C(C[n+]3ccccc3)CSC12',\n", - " 'pubchem_compound_id': '5773',\n", + " 'pubchem_compound_id': 5773,\n", " 'molmedb_id': 'MM00638',\n", " 'source_doi': 'doi:10.1074/jbc.275.3.1699',\n", - " 'source_pmid': '10636865',\n", + " 'source_pmid': 10636865,\n", " 'chebi_id': '3537',\n", " 'drugbank_id': 'DB09008'}]" ] @@ -683,7 +459,7 @@ } ], "source": [ - "inhibitor_df[\"transporter_inhibitor\"][72]" + "inhibitor_df[\"transporter_inhibitor\"][13]" ] }, { @@ -697,7 +473,19 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'disgenet_result' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[9], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m combined_df \u001b[38;5;241m=\u001b[39m combine_sources([disgenet_result, inhibitor_df])\n", + "\u001b[1;31mNameError\u001b[0m: name 'disgenet_result' is not defined" + ] + } + ], "source": [ "combined_df = combine_sources([disgenet_result, inhibitor_df])" ] @@ -708,103 +496,27 @@ "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
identifieridentifier.sourcetargettarget.sourceDisGeNETtransporter_inhibitor
0AGRNHGNC375790NCBI Gene[{'gene_dsi': 0.626, 'gene_dpi': 0.538, 'gene_...[{'label': nan, 'InChIKey': nan, 'SMILES': nan...
1ALG14HGNC199857NCBI Gene[{'gene_dsi': 0.722, 'gene_dpi': 0.308, 'gene_...[{'label': nan, 'InChIKey': nan, 'SMILES': nan...
2ALG2HGNC85365NCBI Gene[{'gene_dsi': 0.67, 'gene_dpi': 0.423, 'gene_p...[{'label': nan, 'InChIKey': nan, 'SMILES': nan...
3CHATHGNC1103NCBI Gene[{'gene_dsi': 0.52, 'gene_dpi': 0.808, 'gene_p...[{'label': nan, 'InChIKey': nan, 'SMILES': nan...
\n", - "
" - ], - "text/plain": [ - " identifier identifier.source target target.source \\\n", - "0 AGRN HGNC 375790 NCBI Gene \n", - "1 ALG14 HGNC 199857 NCBI Gene \n", - "2 ALG2 HGNC 85365 NCBI Gene \n", - "3 CHAT HGNC 1103 NCBI Gene \n", - "\n", - " DisGeNET \\\n", - "0 [{'gene_dsi': 0.626, 'gene_dpi': 0.538, 'gene_... \n", - "1 [{'gene_dsi': 0.722, 'gene_dpi': 0.308, 'gene_... \n", - "2 [{'gene_dsi': 0.67, 'gene_dpi': 0.423, 'gene_p... \n", - "3 [{'gene_dsi': 0.52, 'gene_dpi': 0.808, 'gene_p... \n", - "\n", - " transporter_inhibitor \n", - "0 [{'label': nan, 'InChIKey': nan, 'SMILES': nan... \n", - "1 [{'label': nan, 'InChIKey': nan, 'SMILES': nan... \n", - "2 [{'label': nan, 'InChIKey': nan, 'SMILES': nan... \n", - "3 [{'label': nan, 'InChIKey': nan, 'SMILES': nan... " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'combined_df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[10], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m combined_df\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m4\u001b[39m)\n", + "\u001b[1;31mNameError\u001b[0m: name 'combined_df' is not defined" + ] } ], "source": [ "combined_df.head(4)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -823,9 +535,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.11.8" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/pyBiodatafuse/annotators/molmedb.py b/src/pyBiodatafuse/annotators/molmedb.py index 088ca952..1d7ead7f 100644 --- a/src/pyBiodatafuse/annotators/molmedb.py +++ b/src/pyBiodatafuse/annotators/molmedb.py @@ -8,6 +8,7 @@ from string import Template from typing import Tuple +import numpy as np import pandas as pd from SPARQLWrapper import JSON, SPARQLWrapper @@ -93,6 +94,7 @@ def get_gene_mol_inhibitor(bridgedb_df: pd.DataFrame): col_name="transporter_inhibitor", ) + # if mappings exist but SPARQL returns empty response if (not merged_df.empty) and merged_df["transporter_inhibitor"][0] is None: merged_df.drop_duplicates(subset=["identifier", "transporter_inhibitor"], inplace=True) elif not merged_df.empty: @@ -111,14 +113,16 @@ def get_gene_mol_inhibitor(bridgedb_df: pd.DataFrame): identifiers = merged_df["identifier"].unique() for identifier in identifiers: if merged_df.loc[merged_df["identifier"] == identifier].shape[0] > 1: - mask = merged_df.apply( - lambda x, id=identifier: ( - all(pd.isna(v) for v in d.values()) and x["identifier"] == id - for d in x["transporter_inhibitor"] - ), - axis=1, + mask = merged_df["transporter_inhibitor"].apply( + lambda lst: all( + [ + all([isinstance(val, float) and np.isnan(val) for val in dct.values()]) + for dct in lst + ] + ) ) - merged_df.drop(merged_df[mask].index, inplace=True) + mask2 = merged_df["identifier"].apply(lambda x, id=identifier : x == id) + merged_df.drop(merged_df[mask & mask2].index, inplace=True) # set default order to response dictionaries to keep output consistency merged_df["transporter_inhibitor"] = merged_df["transporter_inhibitor"].apply( @@ -248,7 +252,7 @@ def get_mol_gene_inhibitor(bridgedb_df: pd.DataFrame) -> Tuple[pd.DataFrame, dic def int_response_value_types(resp_list: list, key_list: list): - """Change values in response dictionaries to int to stay consistent woth other Annotators. + """Change values in response dictionaries to int to stay consistent with other Annotators. :param: resp_list: list of response dictionaries. :param: key_list: list of keys to change to int.