diff --git a/src/cs598_dlh_final_project.ipynb b/src/cs598_dlh_final_project.ipynb index 1cb485799937731841e3bbc3a375323dbb0ea1ab..05528cc443d12a80d8952fc616b587b1c2a29593 100644 --- a/src/cs598_dlh_final_project.ipynb +++ b/src/cs598_dlh_final_project.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 38, "id": "5da368cd-3045-4ec0-86fb-2ce15b5d1b92", "metadata": {}, "outputs": [], @@ -19,7 +19,7 @@ "import os\n", "\n", "# Typing includes.\n", - "from typing import Dict, List, Optional\n", + "from typing import Dict, List, Optional, Any\n", "\n", "# Numerical includes.\n", "import numpy as np\n", @@ -230,19 +230,10 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 98, "id": "b881e548-4d27-4725-8c47-b2612157929e", "metadata": {}, - "outputs": [ - { - "ename": "IndentationError", - "evalue": "expected an indented block (845818540.py, line 69)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m File \u001b[0;32m\"/var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_27199/845818540.py\"\u001b[0;36m, line \u001b[0;32m69\u001b[0m\n\u001b[0;31m def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n" - ] - } - ], + "outputs": [], "source": [ "class MIMIC3DatasetWrapper(MIMIC3Dataset):\n", " ''' Add extra tables to the MIMIC III dataset.\n", @@ -254,78 +245,170 @@ " The text data is generally joined on the PATIENTID, HADMID, ITEMID to match the\n", " pyHealth Vists class representation.\n", " '''\n", + " \n", + " # We need to add storage for text-based lookup tables here.\n", + " def __init__(self, *args, **kwargs):\n", + " self._valid_text_tables = [\"D_ICD_DIAGNOSES\", \"D_ITEMS\", \"D_ICD_PROCEDURES\", \"D_LABITEMS\"]\n", + " self._text_descriptions = {x: {} for x in self._valid_text_tables}\n", + " super().__init__(*args, **kwargs)\n", " \n", - " 
# Skip init and defer to base class.\n", + " def get_all_tables(self) -> List[str]: \n", + " return list(self._text_descriptions.keys())\n", + " \n", + " def get_text_dict(self, table_name: str) -> Dict[str, Dict[Any, Any]]:\n", + " return self._text_descriptions.get(table_name)\n", " \n", " # Note the name has to match the table name exactly.\n", " # See https://github.com/sunlabuiuc/PyHealth/blob/master/pyhealth/datasets/mimic3.py#L71.\n", " def parse_d_icd_diagnoses(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n", - " # TODO(botelho3) fill this in to join the text descriptions to the visit.\n", + " \"\"\"Helper function which parses D_ICD_DIAGNOSES table.\n", + " Will be called in `self.parse_tables()`\n", + " Docs:\n", + " - D_ICD_DIAGNOSES: https://mimic.mit.edu/docs/iii/tables/d_icd_diagnoses/\n", + " Args:\n", + " patients: a dict of `Patient` objects indexed by patient_id.\n", + " Returns:\n", + " The updated patients dict.\n", + " Note:\n", + " N/A\n", + " \"\"\"\n", + " table = \"D_ICD_DIAGNOSES\"\n", + " print(f\"Parsing {table}\")\n", + " assert(table in self._valid_text_tables)\n", + " \n", + " # read table\n", + " df = pd.read_csv(\n", + " os.path.join(self.root, f\"{table}.csv\"),\n", + " usecols=[\"ICD9_CODE\", \"SHORT_TITLE\", \"LONG_TITLE\"],\n", + " dtype={\"ICD9_CODE\": str, \"SHORT_TITLE\": str, \"LONG_TITLE\": str}\n", + " )\n", + " \n", + " # drop rows with missing values\n", + " df = df.dropna(subset=[\"ICD9_CODE\", \"SHORT_TITLE\", \"LONG_TITLE\"])\n", + " # sort by ICD9_CODE (read as str, so lexicographic order)\n", + " df = df.sort_values([\"ICD9_CODE\"], ascending=True)\n", + " \n", + " # print(df.head())\n", + " self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')\n", + " \n", + " # We haven't altered the patients array, just return it.\n", " return patients\n", - "# \"\"\"Helper function which parses DIAGNOSES_ICD table.\n", - "# Will be called in `self.parse_tables()`\n", - "# Docs:\n", - "# - 
DIAGNOSES_ICD: https://mimic.mit.edu/docs/iii/tables/diagnoses_icd/\n", - "# Args:\n", - "# patients: a dict of `Patient` objects indexed by patient_id.\n", - "# Returns:\n", - "# The updated patients dict.\n", - "# Note:\n", - "# MIMIC-III does not provide specific timestamps in DIAGNOSES_ICD\n", - "# table, so we set it to None.\n", - "# \"\"\"\n", - "# table = \"DIAGNOSES_ICD\"\n", - "# # read table\n", - "# df = pd.read_csv(\n", - "# os.path.join(self.root, f\"{table}.csv\"),\n", - "# dtype={\"SUBJECT_ID\": str, \"HADM_ID\": str, \"ICD9_CODE\": str},\n", - "# )\n", - "# # drop records of the other patients\n", - "# df = df[df[\"SUBJECT_ID\"].isin(patients.keys())]\n", - "# # drop rows with missing values\n", - "# df = df.dropna(subset=[\"SUBJECT_ID\", \"HADM_ID\", \"ICD9_CODE\"])\n", - "# # sort by sequence number (i.e., priority)\n", - "# df = df.sort_values([\"SUBJECT_ID\", \"HADM_ID\", \"SEQ_NUM\"], ascending=True)\n", - "# # group by patient and visit\n", - "# group_df = df.groupby(\"SUBJECT_ID\")\n", - "\n", - "# # parallel unit of diagnosis (per patient)\n", - "# def diagnosis_unit(p_id, p_info):\n", - "# events = []\n", - "# for v_id, v_info in p_info.groupby(\"HADM_ID\"):\n", - "# for code in v_info[\"ICD9_CODE\"]:\n", - "# event = Event(\n", - "# code=code,\n", - "# table=table,\n", - "# vocabulary=\"ICD9CM\",\n", - "# visit_id=v_id,\n", - "# patient_id=p_id,\n", - "# )\n", - "# events.append(event)\n", - "# return events\n", - "\n", - "# # parallel apply\n", - "# group_df = group_df.parallel_apply(\n", - "# lambda x: diagnosis_unit(x.SUBJECT_ID.unique()[0], x)\n", - "# )\n", - "\n", - "# # summarize the results\n", - "# patients = self._add_events_to_patient_dict(patients, group_df)\n", - "# return patients\n", + " \n", + " def parse_d_labitems(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n", + " \"\"\"Helper function which parses D_LABITEMS table.\n", + " Will be called in `self.parse_tables()`\n", + " Docs:\n", + " - D_LABITEMS: 
https://mimic.mit.edu/docs/iii/tables/d_labitems/\n", + " Args:\n", + " patients: a dict of `Patient` objects indexed by patient_id.\n", + " Returns:\n", + " The updated patients dict.\n", + " Note:\n", + " N/A\n", + " \"\"\"\n", + " table = \"D_LABITEMS\"\n", + " print(f\"Parsing {table}\")\n", + " assert(table in self._valid_text_tables)\n", + " \n", + " # read table\n", + " df = pd.read_csv(\n", + " os.path.join(self.root, f\"{table}.csv\"),\n", + " usecols=[\"ITEMID\", \"LABEL\", \"CATEGORY\", \"FLUID\"],\n", + " dtype={\"ITEMID\": str, \"LABEL\": str, \"CATEGORY\": str, \"FLUID\": str}\n", + " )\n", + " \n", + " # drop rows with missing values\n", + " df = df.dropna(subset=[\"ITEMID\", \"LABEL\", \"CATEGORY\", \"FLUID\"])\n", + " # sort by ITEMID (read as str, so lexicographic order)\n", + " df = df.sort_values([\"ITEMID\"], ascending=True)\n", + " \n", + " self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')\n", + " \n", + " # We haven't altered the patients array, just return it.\n", + " return patients\n", + " \n", + " \n", " def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n", - " # TODO(botelho3) fill this in to join the text descriptions to the visit.\n", + " # TODO(botelho3) - Note this may not be totally useable because the ITEMID\n", + " # unique key only links to these tables using ITEMID\n", + " # - INPUTEVENTS_MV \n", + " # - OUTPUTEVENTS on ITEMID\n", + " # - PROCEDUREEVENTS_MV on ITEMID\n", + " # \n", + " # Not to the tables we want e.g. 
\n", + "\n", + " \"\"\"Helper function which parses D_ITEMS table.\n", + " Will be called in `self.parse_tables()`\n", + " Docs:\n", + " - D_ITEMS: https://mimic.mit.edu/docs/iii/tables/d_items/\n", + " Args:\n", + " patients: a dict of `Patient` objects indexed by patient_id.\n", + " Returns:\n", + " The updated patients dict.\n", + " Note:\n", + " N/A\n", + " \"\"\"\n", + " table = \"D_ITEMS\"\n", + " print(f\"Parsing {table}\")\n", + " assert(table in self._valid_text_tables)\n", + " \n", + " # read table\n", + " df = pd.read_csv(\n", + " os.path.join(self.root, f\"{table}.csv\"),\n", + " usecols=[\"ITEMID\", \"LABEL\", \"CATEGORY\"],\n", + " dtype={\"ITEMID\": str, \"LABEL\": str, \"CATEGORY\": str}\n", + " )\n", + " \n", + " # drop rows with missing values\n", + " df = df.dropna(subset=[\"ITEMID\", \"LABEL\", \"CATEGORY\"])\n", + " # sort by sequence number (i.e., priority)\n", + " df = df.sort_values([\"ITEMID\"], ascending=True)\n", + " \n", + " self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')\n", + " \n", + " # We haven't altered the patients array, just return it.\n", " return patients\n", " \n", + " \n", " def parse_d_icd_procedures(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n", - " # TODO(botelho3) fill this in to join the text descriptions to the visit.\n", + " \"\"\"Helper function which parses D_ICD_PROCEDURES table.\n", + " Will be called in `self.parse_tables()`\n", + " Docs:\n", + " - D_ICD_PROCEDURES: https://mimic.mit.edu/docs/iii/tables/d_icd_procedures/\n", + " Args:\n", + " patients: a dict of `Patient` objects indexed by patient_id.\n", + " Returns:\n", + " The updated patients dict.\n", + " Note:\n", + " N/A\n", + " \"\"\"\n", + " table = \"D_ICD_PROCEDURES\"\n", + " print(f\"Parsing {table}\")\n", + " assert(table in self._valid_text_tables)\n", + " \n", + " # read table\n", + " df = pd.read_csv(\n", + " os.path.join(self.root, f\"{table}.csv\"),\n", + " usecols=[\"ICD9_CODE\", 
\"SHORT_TITLE\", \"LONG_TITLE\"],\n", + " dtype={\"ICD9_CODE\": str, \"SHORT_TITLE\": str, \"LONG_TITLE\": str}\n", + " )\n", + " \n", + " # drop rows with missing values\n", + " df = df.dropna(subset=[\"ICD9_CODE\", \"SHORT_TITLE\", \"LONG_TITLE\"])\n", + " # sort by sequence number (i.e., priority)\n", + " df = df.sort_values([\"ICD9_CODE\"], ascending=True)\n", + " \n", + " # print(df.head())\n", + " self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')\n", + " \n", + " # We haven't altered the patients array, just return it.\n", " return patients\n", " " ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 96, "id": "427a42a6-441e-438c-96f4-b38ba82fd192", "metadata": {}, "outputs": [ @@ -333,11 +416,28 @@ "name": "stderr", "output_type": "stream", "text": [ - "Parsing PATIENTS and ADMISSIONS: 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 540.68it/s]\n", - "Parsing PROCEDURES_ICD: 100%|███████████████████████████████████████████████████████| 113/113 [00:00<00:00, 5934.54it/s]\n", - "Parsing PRESCRIPTIONS: 100%|██████████████████████████████████████████████████████████| 122/122 [00:01<00:00, 99.08it/s]\n", - "Parsing LABEVENTS: 100%|██████████████████████████████████████████████████████████████| 129/129 [00:06<00:00, 21.36it/s]\n", - "Mapping codes: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 214.50it/s]\n" + "Parsing PATIENTS and ADMISSIONS: 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 616.21it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsing D_ICD_DIAGNOSES\n", + "Parsing D_ICD_PROCEDURES\n", + "Parsing D_ITEMS\n", + "Parsing D_LABITEMS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Parsing DIAGNOSES_ICD: 100%|████████████████████████████████████████████████████████| 129/129 [00:00<00:00, 5045.37it/s]\n", + "Parsing PROCEDURES_ICD: 
100%|███████████████████████████████████████████████████████| 113/113 [00:00<00:00, 6525.72it/s]\n", + "Parsing PRESCRIPTIONS: 100%|█████████████████████████████████████████████████████████| 122/122 [00:00<00:00, 136.63it/s]\n", + "Parsing LABEVENTS: 100%|██████████████████████████████████████████████████████████████| 129/129 [00:05<00:00, 25.78it/s]\n", + "Mapping codes: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 244.02it/s]\n" ] }, { @@ -350,8 +450,11 @@ "\t- Number of patients: 100\n", "\t- Number of visits: 129\n", "\t- Number of visits per patient: 1.2900\n", - "\t- Number of events per visit in D_ITEMS: 0.0000\n", + "\t- Number of events per visit in D_ICD_DIAGNOSES: 0.0000\n", "\t- Number of events per visit in D_ICD_PROCEDURES: 0.0000\n", + "\t- Number of events per visit in D_ITEMS: 0.0000\n", + "\t- Number of events per visit in D_LABITEMS: 0.0000\n", + "\t- Number of events per visit in DIAGNOSES_ICD: 13.6512\n", "\t- Number of events per visit in PROCEDURES_ICD: 3.9225\n", "\t- Number of events per visit in PRESCRIPTIONS: 115.6667\n", "\t- Number of events per visit in LABEVENTS: 479.1628\n", @@ -378,16 +481,64 @@ "mimic3base = MIMIC3DatasetWrapper(\n", " #root=\"https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III/\",\n", " root=os.path.join(os.getcwd(), DATA_DIR_),\n", - " tables=[\"D_ITEMS\", \"D_ICD_PROCEDURES\", \"PROCEDURES_ICD\", \"PRESCRIPTIONS\", \"LABEVENTS\",], # \"D_ICD_DIAGNOSES\", \"DIAGNOSES_ICD\"\n", + " tables=[\"D_ICD_DIAGNOSES\", \"D_ICD_PROCEDURES\", \"D_ITEMS\", \"D_LABITEMS\",\n", + " \"DIAGNOSES_ICD\", \"PROCEDURES_ICD\", \"PRESCRIPTIONS\", \"LABEVENTS\"],\n", " # map all NDC codes to ATC 3-rd level codes in these tables\n", " # See https://en.wikipedia.org/wiki/Anatomical_Therapeutic_Chemical_Classification_System.\n", " code_mapping={\"NDC\": (\"ATC\", {\"target_kwargs\": {\"level\": 3}})},\n", + " # Slow\n", + " refresh_cache=True,\n", ")\n", "\n", "mimic3base.stat()\n", 
"mimic3base.info()\n" ] }, + { + "cell_type": "code", + "execution_count": 100, + "id": "4cef4476-6bc8-4791-b246-4c329a80a2e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dict_keys(['D_ICD_DIAGNOSES', 'D_ITEMS', 'D_ICD_PROCEDURES', 'D_LABITEMS'])\n", + "Table: D_ICD_DIAGNOSES\n", + "[['0010', 'Cholera d/t vib cholerae', 'Cholera due to vibrio cholerae'], ['0011', 'Cholera d/t vib el tor', 'Cholera due to vibrio cholerae el tor'], ['0019', 'Cholera NOS', 'Cholera, unspecified'], ['0020', 'Typhoid fever', 'Typhoid fever'], ['0021', 'Paratyphoid fever a', 'Paratyphoid fever A']]\n", + "\n", + "\n", + "\n", + "Table: D_ITEMS\n", + "[['1126', 'Art.pH', 'ABG'], ['1127', 'WBC (4-11,000)', 'Hematology'], ['1520', 'ACT', 'Coags'], ['1521', 'Albumin', 'Chemistry'], ['1522', 'Calcium', 'Chemistry']]\n", + "\n", + "\n", + "\n", + "Table: D_ICD_PROCEDURES\n", + "[['0001', 'Ther ult head & neck ves', 'Therapeutic ultrasound of vessels of head and neck'], ['0002', 'Ther ultrasound of heart', 'Therapeutic ultrasound of heart'], ['0003', 'Ther ult peripheral ves', 'Therapeutic ultrasound of peripheral vascular vessels'], ['0009', 'Other therapeutic ultsnd', 'Other therapeutic ultrasound'], ['0010', 'Implant chemothera agent', 'Implantation of chemotherapeutic agent']]\n", + "\n", + "\n", + "\n", + "Table: D_LABITEMS\n", + "[['50800', 'SPECIMEN TYPE', 'BLOOD', 'BLOOD GAS'], ['50801', 'Alveolar-arterial Gradient', 'Blood', 'Blood Gas'], ['50802', 'Base Excess', 'Blood', 'Blood Gas'], ['50803', 'Calculated Bicarbonate, Whole Blood', 'Blood', 'Blood Gas'], ['50804', 'Calculated Total CO2', 'Blood', 'Blood Gas']]\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "table_names = mimic3base.get_all_tables()\n", + "print(table_names)\n", + "\n", + "for t in table_names:\n", + " d = mimic3base.get_text_dict(t)\n", + " print(f\"Table: {t}\")\n", + " print(d['data'][:5])\n", + " print('\\n\\n')\n" + ] + }, { "cell_type": "markdown", 
"id": "b02b8e28-23ac-4ac1-a9b6-ae546e28c905",