Update MIMIC3DatasetWrapper to parse D_* tables

The data from the tables is stored in `self._text_descriptions` as dictionaries. TBD: the dictionaries can be used by later pyHealth tasks, e.g. `readmission_pred_task()`, to combine the text data for ICD9 codes in the sample tuple.

Update MIMIC3DatasetWrapper to parse D_* tables
25d4d101 · aBotel · 2a76a5e2 · 25d4d101
Commit 25d4d101 authored 2 years ago by aBotel
--- a/src/cs598_dlh_final_project.ipynb
+++ b/src/cs598_dlh_final_project.ipynb
@@ -10,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 38,
   "id": "5da368cd-3045-4ec0-86fb-2ce15b5d1b92",
   "metadata": {},
   "outputs": [],
@@ -19,7 +19,7 @@
    "import os\n",
    "\n",
    "# Typing includes.\n",
-    "from typing import Dict, List, Optional\n",
+    "from typing import Dict, List, Optional, Any\n",
    "\n",
    "# Numerical includes.\n",
    "import numpy as np\n",
@@ -230,19 +230,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 98,
   "id": "b881e548-4d27-4725-8c47-b2612157929e",
   "metadata": {},
-   "outputs": [
-    {
-     "ename": "IndentationError",
-     "evalue": "expected an indented block (845818540.py, line 69)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;36m  File \u001b[0;32m\"/var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_27199/845818540.py\"\u001b[0;36m, line \u001b[0;32m69\u001b[0m\n\u001b[0;31m    def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:\u001b[0m\n\u001b[0m      ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "class MIMIC3DatasetWrapper(MIMIC3Dataset):\n",
    "    ''' Add extra tables to the MIMIC III dataset.\n",
@@ -254,78 +245,170 @@
    "      The text data is generally joined on the PATIENTID, HADMID, ITEMID to match the\n",
    "      pyHealth Vists class representation.\n",
    "    '''\n",
+    "   \n",
+    "    # We need to add storage for text-based lookup tables here.\n",
+    "    def __init__(self, *args, **kwargs):\n",
+    "        self._valid_text_tables = [\"D_ICD_DIAGNOSES\", \"D_ITEMS\", \"D_ICD_PROCEDURES\", \"D_LABITEMS\"]\n",
+    "        self._text_descriptions = {x: {} for x in self._valid_text_tables}\n",
+    "        super().__init__(*args, **kwargs)\n",
    "    \n",
-    "    # Skip init and defer to base class.\n",
+    "    def get_all_tables(self) -> List[str]: \n",
+    "        return list(self._text_descriptions.keys())\n",
+    "        \n",
+    "    def get_text_dict(self, table_name: str) -> Dict[str, Dict[Any, Any]]:\n",
+    "        return self._text_descriptions.get(table_name)\n",
    "    \n",
    "    # Note the name has to match the table name exactly.\n",
    "    # See https://github.com/sunlabuiuc/PyHealth/blob/master/pyhealth/datasets/mimic3.py#L71.\n",
    "    def parse_d_icd_diagnoses(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n",
-    "        # TODO(botelho3) fill this in to join the text descriptions to the visit.\n",
+    "        \"\"\"Helper function which parses D_ICD_DIAGNOSES table.\n",
+    "        Will be called in `self.parse_tables()`\n",
+    "        Docs:\n",
+    "            - D_ICD_DIAGNOSES: https://mimic.mit.edu/docs/iii/tables/d_icd_diagnoses/\n",
+    "        Args:\n",
+    "            patients: a dict of `Patient` objects indexed by patient_id.\n",
+    "        Returns:\n",
+    "            The updated patients dict.\n",
+    "        Note:\n",
+    "            N/A\n",
+    "        \"\"\"\n",
+    "        table = \"D_ICD_DIAGNOSES\"\n",
+    "        print(f\"Parsing {table}\")\n",
+    "        assert(table in self._valid_text_tables)\n",
+    "        \n",
+    "        # read table\n",
+    "        df = pd.read_csv(\n",
+    "            os.path.join(self.root, f\"{table}.csv\"),\n",
+    "            usecols=[\"ICD9_CODE\", \"SHORT_TITLE\", \"LONG_TITLE\"],\n",
+    "            dtype={\"ICD9_CODE\": str, \"SHORT_TITLE\": str, \"LONG_TITLE\": str}\n",
+    "        )\n",
+    "        \n",
+    "        # drop rows with missing values\n",
+    "        df = df.dropna(subset=[\"ICD9_CODE\", \"SHORT_TITLE\", \"LONG_TITLE\"])\n",
+    "        # sort rows by ICD9 code (compared as strings, since ICD9_CODE is read as str)\n",
+    "        df = df.sort_values([\"ICD9_CODE\"], ascending=True)\n",
+    "       \n",
+    "        # print(df.head())\n",
+    "        self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')\n",
+    "        \n",
+    "        # We haven't altered the patients array, just return it.\n",
    "        return patients\n",
-    "#                 \"\"\"Helper function which parses DIAGNOSES_ICD table.\n",
-    "#         Will be called in `self.parse_tables()`\n",
-    "#         Docs:\n",
-    "#             - DIAGNOSES_ICD: https://mimic.mit.edu/docs/iii/tables/diagnoses_icd/\n",
-    "#         Args:\n",
-    "#             patients: a dict of `Patient` objects indexed by patient_id.\n",
-    "#         Returns:\n",
-    "#             The updated patients dict.\n",
-    "#         Note:\n",
-    "#             MIMIC-III does not provide specific timestamps in DIAGNOSES_ICD\n",
-    "#                 table, so we set it to None.\n",
-    "#         \"\"\"\n",
-    "#         table = \"DIAGNOSES_ICD\"\n",
-    "#         # read table\n",
-    "#         df = pd.read_csv(\n",
-    "#             os.path.join(self.root, f\"{table}.csv\"),\n",
-    "#             dtype={\"SUBJECT_ID\": str, \"HADM_ID\": str, \"ICD9_CODE\": str},\n",
-    "#         )\n",
-    "#         # drop records of the other patients\n",
-    "#         df = df[df[\"SUBJECT_ID\"].isin(patients.keys())]\n",
-    "#         # drop rows with missing values\n",
-    "#         df = df.dropna(subset=[\"SUBJECT_ID\", \"HADM_ID\", \"ICD9_CODE\"])\n",
-    "#         # sort by sequence number (i.e., priority)\n",
-    "#         df = df.sort_values([\"SUBJECT_ID\", \"HADM_ID\", \"SEQ_NUM\"], ascending=True)\n",
-    "#         # group by patient and visit\n",
-    "#         group_df = df.groupby(\"SUBJECT_ID\")\n",
-    "\n",
-    "#         # parallel unit of diagnosis (per patient)\n",
-    "#         def diagnosis_unit(p_id, p_info):\n",
-    "#             events = []\n",
-    "#             for v_id, v_info in p_info.groupby(\"HADM_ID\"):\n",
-    "#                 for code in v_info[\"ICD9_CODE\"]:\n",
-    "#                     event = Event(\n",
-    "#                         code=code,\n",
-    "#                         table=table,\n",
-    "#                         vocabulary=\"ICD9CM\",\n",
-    "#                         visit_id=v_id,\n",
-    "#                         patient_id=p_id,\n",
-    "#                     )\n",
-    "#                     events.append(event)\n",
-    "#             return events\n",
-    "\n",
-    "#         # parallel apply\n",
-    "#         group_df = group_df.parallel_apply(\n",
-    "#             lambda x: diagnosis_unit(x.SUBJECT_ID.unique()[0], x)\n",
-    "#         )\n",
-    "\n",
-    "#         # summarize the results\n",
-    "#         patients = self._add_events_to_patient_dict(patients, group_df)\n",
-    "#         return patients\n",
+    "    \n",
+    "    def parse_d_labitems(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n",
+    "        \"\"\"Helper function which parses D_LABITEMS table.\n",
+    "        Will be called in `self.parse_tables()`\n",
+    "        Docs:\n",
+    "            - D_LABITEMS: https://mimic.mit.edu/docs/iii/tables/d_labitems/\n",
+    "        Args:\n",
+    "            patients: a dict of `Patient` objects indexed by patient_id.\n",
+    "        Returns:\n",
+    "            The updated patients dict.\n",
+    "        Note:\n",
+    "            N/A\n",
+    "        \"\"\"\n",
+    "        table = \"D_LABITEMS\"\n",
+    "        print(f\"Parsing {table}\")\n",
+    "        assert(table in self._valid_text_tables)\n",
+    "        \n",
+    "        # read table\n",
+    "        df = pd.read_csv(\n",
+    "            os.path.join(self.root, f\"{table}.csv\"),\n",
+    "            usecols=[\"ITEMID\", \"LABEL\", \"CATEGORY\", \"FLUID\"],\n",
+    "            dtype={\"ITEMID\": str, \"LABEL\": str, \"CATEGORY\": str, \"FLUID\": str}\n",
+    "        )\n",
+    "        \n",
+    "        # drop rows with missing values\n",
+    "        df = df.dropna(subset=[\"ITEMID\", \"LABEL\", \"CATEGORY\", \"FLUID\"])\n",
+    "        # sort rows by ITEMID (compared as strings, since ITEMID is read as str)\n",
+    "        df = df.sort_values([\"ITEMID\"], ascending=True)\n",
+    "       \n",
+    "        self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')\n",
+    "        \n",
+    "        # We haven't altered the patients array, just return it.\n",
+    "        return patients\n",
+    "    \n",
    "    \n",
    "    def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n",
-    "        # TODO(botelho3) fill this in to join the text descriptions to the visit.\n",
+    "        # TODO(botelho3) - Note this may not be totally usable because the ITEMID\n",
+    "        # unique key only links to these tables using ITEMID\n",
+    "        #   - INPUTEVENTS_MV \n",
+    "        #   - OUTPUTEVENTS on ITEMID\n",
+    "        #   - PROCEDUREEVENTS_MV on ITEMID\n",
+    "        # \n",
+    "        # Not to the tables we want e.g. \n",
+    "\n",
+    "        \"\"\"Helper function which parses D_ITEMS table.\n",
+    "        Will be called in `self.parse_tables()`\n",
+    "        Docs:\n",
+    "            - D_ITEMS: https://mimic.mit.edu/docs/iii/tables/d_items/\n",
+    "        Args:\n",
+    "            patients: a dict of `Patient` objects indexed by patient_id.\n",
+    "        Returns:\n",
+    "            The updated patients dict.\n",
+    "        Note:\n",
+    "            N/A\n",
+    "        \"\"\"\n",
+    "        table = \"D_ITEMS\"\n",
+    "        print(f\"Parsing {table}\")\n",
+    "        assert(table in self._valid_text_tables)\n",
+    "        \n",
+    "        # read table\n",
+    "        df = pd.read_csv(\n",
+    "            os.path.join(self.root, f\"{table}.csv\"),\n",
+    "            usecols=[\"ITEMID\", \"LABEL\", \"CATEGORY\"],\n",
+    "            dtype={\"ITEMID\": str, \"LABEL\": str, \"CATEGORY\": str}\n",
+    "        )\n",
+    "        \n",
+    "        # drop rows with missing values\n",
+    "        df = df.dropna(subset=[\"ITEMID\", \"LABEL\", \"CATEGORY\"])\n",
+    "        # sort rows by ITEMID (compared as strings, since ITEMID is read as str)\n",
+    "        df = df.sort_values([\"ITEMID\"], ascending=True)\n",
+    "       \n",
+    "        self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')\n",
+    "        \n",
+    "        # We haven't altered the patients array, just return it.\n",
    "        return patients\n",
    "    \n",
+    "    \n",
    "    def parse_d_icd_procedures(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n",
-    "        # TODO(botelho3) fill this in to join the text descriptions to the visit.\n",
+    "        \"\"\"Helper function which parses D_ICD_PROCEDURES table.\n",
+    "        Will be called in `self.parse_tables()`\n",
+    "        Docs:\n",
+    "            - D_ICD_PROCEDURES: https://mimic.mit.edu/docs/iii/tables/d_icd_procedures/\n",
+    "        Args:\n",
+    "            patients: a dict of `Patient` objects indexed by patient_id.\n",
+    "        Returns:\n",
+    "            The updated patients dict.\n",
+    "        Note:\n",
+    "            N/A\n",
+    "        \"\"\"\n",
+    "        table = \"D_ICD_PROCEDURES\"\n",
+    "        print(f\"Parsing {table}\")\n",
+    "        assert(table in self._valid_text_tables)\n",
+    "        \n",
+    "        # read table\n",
+    "        df = pd.read_csv(\n",
+    "            os.path.join(self.root, f\"{table}.csv\"),\n",
+    "            usecols=[\"ICD9_CODE\", \"SHORT_TITLE\", \"LONG_TITLE\"],\n",
+    "            dtype={\"ICD9_CODE\": str, \"SHORT_TITLE\": str, \"LONG_TITLE\": str}\n",
+    "        )\n",
+    "        \n",
+    "        # drop rows with missing values\n",
+    "        df = df.dropna(subset=[\"ICD9_CODE\", \"SHORT_TITLE\", \"LONG_TITLE\"])\n",
+    "        # sort rows by ICD9 code (compared as strings, since ICD9_CODE is read as str)\n",
+    "        df = df.sort_values([\"ICD9_CODE\"], ascending=True)\n",
+    "       \n",
+    "        # print(df.head())\n",
+    "        self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')\n",
+    "        \n",
+    "        # We haven't altered the patients array, just return it.\n",
    "        return patients\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 96,
   "id": "427a42a6-441e-438c-96f4-b38ba82fd192",
   "metadata": {},
   "outputs": [
@@ -333,11 +416,28 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "Parsing PATIENTS and ADMISSIONS: 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 540.68it/s]\n",
-      "Parsing PROCEDURES_ICD: 100%|███████████████████████████████████████████████████████| 113/113 [00:00<00:00, 5934.54it/s]\n",
-      "Parsing PRESCRIPTIONS: 100%|██████████████████████████████████████████████████████████| 122/122 [00:01<00:00, 99.08it/s]\n",
-      "Parsing LABEVENTS: 100%|██████████████████████████████████████████████████████████████| 129/129 [00:06<00:00, 21.36it/s]\n",
-      "Mapping codes: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 214.50it/s]\n"
+      "Parsing PATIENTS and ADMISSIONS: 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 616.21it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parsing D_ICD_DIAGNOSES\n",
+      "Parsing D_ICD_PROCEDURES\n",
+      "Parsing D_ITEMS\n",
+      "Parsing D_LABITEMS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Parsing DIAGNOSES_ICD: 100%|████████████████████████████████████████████████████████| 129/129 [00:00<00:00, 5045.37it/s]\n",
+      "Parsing PROCEDURES_ICD: 100%|███████████████████████████████████████████████████████| 113/113 [00:00<00:00, 6525.72it/s]\n",
+      "Parsing PRESCRIPTIONS: 100%|█████████████████████████████████████████████████████████| 122/122 [00:00<00:00, 136.63it/s]\n",
+      "Parsing LABEVENTS: 100%|██████████████████████████████████████████████████████████████| 129/129 [00:05<00:00, 25.78it/s]\n",
+      "Mapping codes: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 244.02it/s]\n"
     ]
    },
    {
@@ -350,8 +450,11 @@
      "\t- Number of patients: 100\n",
      "\t- Number of visits: 129\n",
      "\t- Number of visits per patient: 1.2900\n",
-      "\t- Number of events per visit in D_ITEMS: 0.0000\n",
+      "\t- Number of events per visit in D_ICD_DIAGNOSES: 0.0000\n",
      "\t- Number of events per visit in D_ICD_PROCEDURES: 0.0000\n",
+      "\t- Number of events per visit in D_ITEMS: 0.0000\n",
+      "\t- Number of events per visit in D_LABITEMS: 0.0000\n",
+      "\t- Number of events per visit in DIAGNOSES_ICD: 13.6512\n",
      "\t- Number of events per visit in PROCEDURES_ICD: 3.9225\n",
      "\t- Number of events per visit in PRESCRIPTIONS: 115.6667\n",
      "\t- Number of events per visit in LABEVENTS: 479.1628\n",
@@ -378,16 +481,64 @@
    "mimic3base = MIMIC3DatasetWrapper(\n",
    "    #root=\"https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III/\",\n",
    "    root=os.path.join(os.getcwd(), DATA_DIR_),\n",
-    "    tables=[\"D_ITEMS\", \"D_ICD_PROCEDURES\", \"PROCEDURES_ICD\", \"PRESCRIPTIONS\", \"LABEVENTS\",], # \"D_ICD_DIAGNOSES\", \"DIAGNOSES_ICD\"\n",
+    "    tables=[\"D_ICD_DIAGNOSES\", \"D_ICD_PROCEDURES\", \"D_ITEMS\", \"D_LABITEMS\",\n",
+    "            \"DIAGNOSES_ICD\", \"PROCEDURES_ICD\", \"PRESCRIPTIONS\", \"LABEVENTS\"],\n",
    "    # map all NDC codes to ATC 3-rd level codes in these tables\n",
    "    # See https://en.wikipedia.org/wiki/Anatomical_Therapeutic_Chemical_Classification_System.\n",
    "    code_mapping={\"NDC\": (\"ATC\", {\"target_kwargs\": {\"level\": 3}})},\n",
+    "    # Slow\n",
+    "    refresh_cache=True,\n",
    ")\n",
    "\n",
    "mimic3base.stat()\n",
    "mimic3base.info()\n"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "id": "4cef4476-6bc8-4791-b246-4c329a80a2e4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dict_keys(['D_ICD_DIAGNOSES', 'D_ITEMS', 'D_ICD_PROCEDURES', 'D_LABITEMS'])\n",
+      "Table: D_ICD_DIAGNOSES\n",
+      "[['0010', 'Cholera d/t vib cholerae', 'Cholera due to vibrio cholerae'], ['0011', 'Cholera d/t vib el tor', 'Cholera due to vibrio cholerae el tor'], ['0019', 'Cholera NOS', 'Cholera, unspecified'], ['0020', 'Typhoid fever', 'Typhoid fever'], ['0021', 'Paratyphoid fever a', 'Paratyphoid fever A']]\n",
+      "\n",
+      "\n",
+      "\n",
+      "Table: D_ITEMS\n",
+      "[['1126', 'Art.pH', 'ABG'], ['1127', 'WBC   (4-11,000)', 'Hematology'], ['1520', 'ACT', 'Coags'], ['1521', 'Albumin', 'Chemistry'], ['1522', 'Calcium', 'Chemistry']]\n",
+      "\n",
+      "\n",
+      "\n",
+      "Table: D_ICD_PROCEDURES\n",
+      "[['0001', 'Ther ult head & neck ves', 'Therapeutic ultrasound of vessels of head and neck'], ['0002', 'Ther ultrasound of heart', 'Therapeutic ultrasound of heart'], ['0003', 'Ther ult peripheral ves', 'Therapeutic ultrasound of peripheral vascular vessels'], ['0009', 'Other therapeutic ultsnd', 'Other therapeutic ultrasound'], ['0010', 'Implant chemothera agent', 'Implantation of chemotherapeutic agent']]\n",
+      "\n",
+      "\n",
+      "\n",
+      "Table: D_LABITEMS\n",
+      "[['50800', 'SPECIMEN TYPE', 'BLOOD', 'BLOOD GAS'], ['50801', 'Alveolar-arterial Gradient', 'Blood', 'Blood Gas'], ['50802', 'Base Excess', 'Blood', 'Blood Gas'], ['50803', 'Calculated Bicarbonate, Whole Blood', 'Blood', 'Blood Gas'], ['50804', 'Calculated Total CO2', 'Blood', 'Blood Gas']]\n",
+      "\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "table_names = mimic3base.get_all_tables()\n",
+    "print(table_names)\n",
+    "\n",
+    "for t in table_names:\n",
+    "    d = mimic3base.get_text_dict(t)\n",
+    "    print(f\"Table: {t}\")\n",
+    "    print(d['data'][:5])\n",
+    "    print('\\n\\n')\n"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "b02b8e28-23ac-4ac1-a9b6-ae546e28c905",

 %% Cell type:markdown id:7ccba6af-1867-491d-b9d0-d99965738c64 tags:

 # Imports

 %% Cell type:code id:5da368cd-3045-4ec0-86fb-2ce15b5d1b92 tags:

 ``` python
 # General includes.
 import os

 # Typing includes.
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Any

 # Numerical includes.
 import numpy as np
 import pandas as pd
 import torch
 from torch import nn

 # pyHealth includes.
 from pyhealth.datasets import MIMIC3Dataset
 from pyhealth.data import Patient, Visit, Event
 ```

 %% Cell type:code id:95e714b1-ca5d-4d7b-9ace-6b4372adfd9f tags:

 ``` python
 # Model imports
 from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
 ```

 %% Cell type:markdown id:cf83fa50-9dd0-467b-bd36-634795e4a09c tags:

 # Globals

 %% Cell type:code id:ebef3528-0396-459a-8112-b272089f5d67 tags:

 ``` python
 # When True, the prediction cell moves its input tensors and the model to GPU_STRING_.
 USE_GPU_ = False
 # Device string handed to `.to(...)` when USE_GPU_ is enabled.
 GPU_STRING_ = 'cuda'
 # Directory (relative to os.getcwd()) used as the MIMIC-III dataset root.
 DATA_DIR_ = '../data_input_path/mimic'
 ```

 %% Cell type:markdown id:df0f8271-44b2-44a2-b7cc-cd7339a70e87 tags:

 # Preprocessing

 %% Cell type:code id:02624f53-6cee-4db0-8b15-e08b755a04f2 tags:

 ``` python
 ```

 %% Cell type:markdown id:ebe2c2d9-c161-4b9d-8182-86384faea929 tags:

 ### Load BERT Model

 %% Cell type:markdown id:39b0edec-1570-4640-9926-d908b456f957 tags:

 See instructions here:
 - https://pypi.org/project/pytorch-pretrained-bert/#examples
 - https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
 - Could also get it from pytorch transformers library: https://pytorch.org/hub/huggingface_pytorch-transformers/

 %% Cell type:code id:f40665f4-db92-4567-b0f9-001bd35e9284 tags:

 ``` python

 # Load pre-trained model tokenizer (vocabulary)
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


 # TODO(botelho3)`bert-base-uncased` is big. Load `bert-tiny` instead from the filesystem?
 # Model available at https://huggingface.co/prajjwal1/bert-tiny.
 # model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None)

 # Tokenize a two-sentence input; the [CLS]/[SEP] markers are written into the text by hand.
 text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
 tokenized_text = tokenizer.tokenize(text)

 # Mask a token that we will try to predict back with `BertForMaskedLM`.
 # Index 8 is the second 'henson' (see the expected token list in the assert below).
 masked_index = 8
 tokenized_text[masked_index] = '[MASK]'
 # Sanity check: pins the exact word-piece tokenization the rest of the cell relies on.
 assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

 # Convert token to vocabulary indices
 indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
 # Define sentence A and B indices associated to 1st and 2nd sentences (see paper):
 # the first 7 tokens (through the first [SEP]) get 0, the remaining 7 get 1.
 segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

 # Convert inputs to PyTorch tensors; wrapping each list in another list adds a
 # leading dimension of size 1 (a single-example batch).
 tokens_tensor = torch.tensor([indexed_tokens])
 segments_tensors = torch.tensor([segments_ids])
 ```

 %% Output

    100%|█████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 877100.29B/s]

 %% Cell type:code id:a7539b72-a871-4d52-947b-075b5c4be2ae tags:

 ``` python
 ```

 %% Cell type:code id:de132943-2c3e-405b-b2f8-5d348472672c tags:

 ``` python
 # Load pre-trained masked-language-model weights and switch to inference mode
 # (`eval()` turns off dropout).
 model = BertForMaskedLM.from_pretrained('bert-base-uncased')
 model.eval()

 # If you have a GPU, put everything on cuda
 if USE_GPU_:
    tokens_tensor = tokens_tensor.to(GPU_STRING_)
    segments_tensors = segments_tensors.to(GPU_STRING_)
    model.to(GPU_STRING_)

 # Predict all tokens. The segment ids are passed by keyword so they cannot be
 # mistaken for another positional argument if the forward() signature differs
 # between library versions.
 with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)

 # The model call returned a tuple here (indexing it with `[0, masked_index]`
 # raised "tuple indices must be integers or slices, not tuple"), so take the
 # first element as the prediction scores. A bare tensor is still accepted.
 predictions = outputs[0] if isinstance(outputs, tuple) else outputs

 # confirm we were able to predict 'henson'
 predicted_index = torch.argmax(predictions[0, masked_index]).item()
 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
 assert predicted_token == 'henson'
 ```

 %% Output

    ---------------------------------------------------------------------------
    TypeError                                 Traceback (most recent call last)
    /var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_23031/2203343201.py in <module>
         14
         15 # confirm we were able to predict 'henson'
    ---> 16 predicted_index = torch.argmax(predictions[0, masked_index]).item()
         17 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
         18 assert predicted_token == 'henson'
    TypeError: tuple indices must be integers or slices, not tuple

 %% Cell type:code id:02efb18a-626d-42d4-b19d-ba4b8f752a00 tags:

 ``` python
 ```

 %% Cell type:code id:0e2d58de-bf9a-4b57-977f-6aa3ac75e84f tags:

 ``` python
 # https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
 class BERTClassification(nn.Module):
    """Pre-trained `bert-base-cased` encoder with a dropout + linear head.

    The head maps a 768-wide vector to a single output value per example.
    """

    def __init__ (self):
        super().__init__()
        # Sub-modules are registered in the same order as before: encoder,
        # dropout, then the output projection.
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.bert_drop = nn.Dropout(p=0.4)
        self.out = nn.Linear(in_features=768, out_features=1)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and project its second output through the head.

        Args:
            ids: token ids passed to the encoder as its first argument.
            mask: forwarded as `attention_mask`.
            token_type_ids: forwarded as `token_type_ids`.
        Returns:
            The output of the linear layer applied after dropout.
        """
        encoder_outputs = self.bert(
            ids, attention_mask=mask, token_type_ids=token_type_ids
        )
        # The encoder is expected to yield exactly two values; only the second
        # (presumably the pooled representation - confirm against the BertModel
        # version in use) feeds the classification head.
        _, pooled = encoder_outputs
        return self.out(self.bert_drop(pooled))
 ```

 %% Cell type:markdown id:8f0d5fa9-226e-4612-b28f-ed24de16399a tags:

 ### Load MIMIC III Data

 %% Cell type:code id:b881e548-4d27-4725-8c47-b2612157929e tags:

 ``` python
 class MIMIC3DatasetWrapper(MIMIC3Dataset):
    ''' Add extra tables to the MIMIC III dataset.

      Some of the tables we need like "D_ICD_DIAGNOSES", "D_ITEMS", "D_ICD_PROCEDURES"
      are not supported out of the box.

      This class defines parsing methods to extract text data from these extra tables.
      The text data is generally joined on the PATIENTID, HADMID, ITEMID to match the
      pyHealth Visit class representation.
    '''

-    # Skip init and defer to base class.
+    # We need to add storage for text-based lookup tables here.
+    def __init__(self, *args, **kwargs):
+        self._valid_text_tables = ["D_ICD_DIAGNOSES", "D_ITEMS", "D_ICD_PROCEDURES", "D_LABITEMS"]
+        self._text_descriptions = {x: {} for x in self._valid_text_tables}
+        super().__init__(*args, **kwargs)
+
+    def get_all_tables(self) -> List[str]:
+        return list(self._text_descriptions.keys())
+
+    def get_text_dict(self, table_name: str) -> Dict[str, Dict[Any, Any]]:
+        return self._text_descriptions.get(table_name)

    # Note the name has to match the table name exactly.
    # See https://github.com/sunlabuiuc/PyHealth/blob/master/pyhealth/datasets/mimic3.py#L71.
    def parse_d_icd_diagnoses(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
-        # TODO(botelho3) fill this in to join the text descriptions to the visit.
+        """Helper function which parses D_ICD_DIAGNOSES table.
+        Will be called in `self.parse_tables()`
+        Docs:
+            - D_ICD_DIAGNOSES: https://mimic.mit.edu/docs/iii/tables/d_icd_diagnoses/
+        Args:
+            patients: a dict of `Patient` objects indexed by patient_id.
+        Returns:
+            The updated patients dict.
+        Note:
+            N/A
+        """
+        table = "D_ICD_DIAGNOSES"
+        print(f"Parsing {table}")
+        assert(table in self._valid_text_tables)
+
+        # read table
+        df = pd.read_csv(
+            os.path.join(self.root, f"{table}.csv"),
+            usecols=["ICD9_CODE", "SHORT_TITLE", "LONG_TITLE"],
+            dtype={"ICD9_CODE": str, "SHORT_TITLE": str, "LONG_TITLE": str}
+        )
+
+        # drop rows with missing values
+        df = df.dropna(subset=["ICD9_CODE", "SHORT_TITLE", "LONG_TITLE"])
+        # sort rows by ICD9 code (compared as strings, since ICD9_CODE is read as str)
+        df = df.sort_values(["ICD9_CODE"], ascending=True)
+
+        # print(df.head())
+        self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')
+
+        # We haven't altered the patients array, just return it.
+        return patients
+
+    def parse_d_labitems(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
+        """Helper function which parses D_LABITEMS table.
+        Will be called in `self.parse_tables()`
+        Docs:
+            - D_LABITEMS: https://mimic.mit.edu/docs/iii/tables/d_labitems/
+        Args:
+            patients: a dict of `Patient` objects indexed by patient_id.
+        Returns:
+            The updated patients dict.
+        Note:
+            N/A
+        """
+        table = "D_LABITEMS"
+        print(f"Parsing {table}")
+        assert(table in self._valid_text_tables)
+
+        # read table
+        df = pd.read_csv(
+            os.path.join(self.root, f"{table}.csv"),
+            usecols=["ITEMID", "LABEL", "CATEGORY", "FLUID"],
+            dtype={"ITEMID": str, "LABEL": str, "CATEGORY": str, "FLUID": str}
+        )
+
+        # drop rows with missing values
+        df = df.dropna(subset=["ITEMID", "LABEL", "CATEGORY", "FLUID"])
+        # sort rows by ITEMID (compared as strings, since ITEMID is read as str)
+        df = df.sort_values(["ITEMID"], ascending=True)
+
+        self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')
+
+        # We haven't altered the patients array, just return it.
        return patients
-#                 """Helper function which parses DIAGNOSES_ICD table.
-#         Will be called in `self.parse_tables()`
-#         Docs:
-#             - DIAGNOSES_ICD: https://mimic.mit.edu/docs/iii/tables/diagnoses_icd/
-#         Args:
-#             patients: a dict of `Patient` objects indexed by patient_id.
-#         Returns:
-#             The updated patients dict.
-#         Note:
-#             MIMIC-III does not provide specific timestamps in DIAGNOSES_ICD
-#                 table, so we set it to None.
-#         """
-#         table = "DIAGNOSES_ICD"
-#         # read table
-#         df = pd.read_csv(
-#             os.path.join(self.root, f"{table}.csv"),
-#             dtype={"SUBJECT_ID": str, "HADM_ID": str, "ICD9_CODE": str},
-#         )
-#         # drop records of the other patients
-#         df = df[df["SUBJECT_ID"].isin(patients.keys())]
-#         # drop rows with missing values
-#         df = df.dropna(subset=["SUBJECT_ID", "HADM_ID", "ICD9_CODE"])
-#         # sort by sequence number (i.e., priority)
-#         df = df.sort_values(["SUBJECT_ID", "HADM_ID", "SEQ_NUM"], ascending=True)
-#         # group by patient and visit
-#         group_df = df.groupby("SUBJECT_ID")
-
-#         # parallel unit of diagnosis (per patient)
-#         def diagnosis_unit(p_id, p_info):
-#             events = []
-#             for v_id, v_info in p_info.groupby("HADM_ID"):
-#                 for code in v_info["ICD9_CODE"]:
-#                     event = Event(
-#                         code=code,
-#                         table=table,
-#                         vocabulary="ICD9CM",
-#                         visit_id=v_id,
-#                         patient_id=p_id,
-#                     )
-#                     events.append(event)
-#             return events
-
-#         # parallel apply
-#         group_df = group_df.parallel_apply(
-#             lambda x: diagnosis_unit(x.SUBJECT_ID.unique()[0], x)
-#         )
-
-#         # summarize the results
-#         patients = self._add_events_to_patient_dict(patients, group_df)
-#         return patients
+

    def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
-        # TODO(botelho3) fill this in to join the text descriptions to the visit.
+        # TODO(botelho3) - Note this may not be totally usable because the ITEMID
+        # unique key only links to these tables using ITEMID
+        #   - INPUTEVENTS_MV
+        #   - OUTPUTEVENTS on ITEMID
+        #   - PROCEDUREEVENTS_MV on ITEMID
+        #
+        # Not to the tables we want e.g.
+
+        """Helper function which parses D_ITEMS table.
+        Will be called in `self.parse_tables()`
+        Docs:
+            - D_ITEMS: https://mimic.mit.edu/docs/iii/tables/d_items/
+        Args:
+            patients: a dict of `Patient` objects indexed by patient_id.
+        Returns:
+            The updated patients dict.
+        Note:
+            N/A
+        """
+        table = "D_ITEMS"
+        print(f"Parsing {table}")
+        assert(table in self._valid_text_tables)
+
+        # read table
+        df = pd.read_csv(
+            os.path.join(self.root, f"{table}.csv"),
+            usecols=["ITEMID", "LABEL", "CATEGORY"],
+            dtype={"ITEMID": str, "LABEL": str, "CATEGORY": str}
+        )
+
+        # drop rows with missing values
+        df = df.dropna(subset=["ITEMID", "LABEL", "CATEGORY"])
+        # sort rows by ITEMID (compared as strings, since ITEMID is read as str)
+        df = df.sort_values(["ITEMID"], ascending=True)
+
+        self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')
+
+        # We haven't altered the patients array, just return it.
        return patients

+
    def parse_d_icd_procedures(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
-        # TODO(botelho3) fill this in to join the text descriptions to the visit.
-        return patients
+        """Helper function which parses D_ICD_PROCEDURES table.
+        Will be called in `self.parse_tables()`
+        Docs:
+            - D_ICD_PROCEDURES: https://mimic.mit.edu/docs/iii/tables/d_icd_procedures/
+        Args:
+            patients: a dict of `Patient` objects indexed by patient_id.
+        Returns:
+            The updated patients dict.
+        Note:
+            N/A
+        """
+        table = "D_ICD_PROCEDURES"
+        print(f"Parsing {table}")
+        assert(table in self._valid_text_tables)
+
+        # read table
+        df = pd.read_csv(
+            os.path.join(self.root, f"{table}.csv"),
+            usecols=["ICD9_CODE", "SHORT_TITLE", "LONG_TITLE"],
+            dtype={"ICD9_CODE": str, "SHORT_TITLE": str, "LONG_TITLE": str}
+        )

-```
+        # drop rows with missing values
+        df = df.dropna(subset=["ICD9_CODE", "SHORT_TITLE", "LONG_TITLE"])
+        # sort rows by ICD9_CODE
+        df = df.sort_values(["ICD9_CODE"], ascending=True)

-%% Output
+        # print(df.head())
+        self._text_descriptions[table] = df.reset_index(drop=True).to_dict(orient='split')
+
+        # We haven't altered the patients dict, just return it.
+        return patients

-      File "/var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_27199/845818540.py", line 69
-        def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
-          ^
-    IndentationError: expected an indented block
+```

 %% Cell type:code id:427a42a6-441e-438c-96f4-b38ba82fd192 tags:

 ``` python
 mimic3base = MIMIC3DatasetWrapper(
    #root="https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III/",
    root=os.path.join(os.getcwd(), DATA_DIR_),
-    tables=["D_ITEMS", "D_ICD_PROCEDURES", "PROCEDURES_ICD", "PRESCRIPTIONS", "LABEVENTS",], # "D_ICD_DIAGNOSES", "DIAGNOSES_ICD"
+    tables=["D_ICD_DIAGNOSES", "D_ICD_PROCEDURES", "D_ITEMS", "D_LABITEMS",
+            "DIAGNOSES_ICD", "PROCEDURES_ICD", "PRESCRIPTIONS", "LABEVENTS"],
    # map all NDC codes to ATC 3-rd level codes in these tables
    # See https://en.wikipedia.org/wiki/Anatomical_Therapeutic_Chemical_Classification_System.
    code_mapping={"NDC": ("ATC", {"target_kwargs": {"level": 3}})},
+    # Slow
+    refresh_cache=True,
 )

 mimic3base.stat()
 mimic3base.info()
 ```

 %% Output

-    Parsing PATIENTS and ADMISSIONS: 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 540.68it/s]
-    Parsing PROCEDURES_ICD: 100%|███████████████████████████████████████████████████████| 113/113 [00:00<00:00, 5934.54it/s]
-    Parsing PRESCRIPTIONS: 100%|██████████████████████████████████████████████████████████| 122/122 [00:01<00:00, 99.08it/s]
-    Parsing LABEVENTS: 100%|██████████████████████████████████████████████████████████████| 129/129 [00:06<00:00, 21.36it/s]
-    Mapping codes: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 214.50it/s]
+    Parsing PATIENTS and ADMISSIONS: 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 616.21it/s]
+
+    Parsing D_ICD_DIAGNOSES
+    Parsing D_ICD_PROCEDURES
+    Parsing D_ITEMS
+    Parsing D_LABITEMS
+
+    Parsing DIAGNOSES_ICD: 100%|████████████████████████████████████████████████████████| 129/129 [00:00<00:00, 5045.37it/s]
+    Parsing PROCEDURES_ICD: 100%|███████████████████████████████████████████████████████| 113/113 [00:00<00:00, 6525.72it/s]
+    Parsing PRESCRIPTIONS: 100%|█████████████████████████████████████████████████████████| 122/122 [00:00<00:00, 136.63it/s]
+    Parsing LABEVENTS: 100%|██████████████████████████████████████████████████████████████| 129/129 [00:05<00:00, 25.78it/s]
+    Mapping codes: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 244.02it/s]

    
    Statistics of base dataset (dev=False):
    	- Dataset: MIMIC3DatasetWrapper
    	- Number of patients: 100
    	- Number of visits: 129
    	- Number of visits per patient: 1.2900
-    	- Number of events per visit in D_ITEMS: 0.0000
+    	- Number of events per visit in D_ICD_DIAGNOSES: 0.0000
    	- Number of events per visit in D_ICD_PROCEDURES: 0.0000
+    	- Number of events per visit in D_ITEMS: 0.0000
+    	- Number of events per visit in D_LABITEMS: 0.0000
+    	- Number of events per visit in DIAGNOSES_ICD: 13.6512
    	- Number of events per visit in PROCEDURES_ICD: 3.9225
    	- Number of events per visit in PRESCRIPTIONS: 115.6667
    	- Number of events per visit in LABEVENTS: 479.1628
    
    
    dataset.patients: patient_id -> <Patient>
    
    <Patient>
        - visits: visit_id -> <Visit>
        - other patient-level info
    
        <Visit>
            - event_list_dict: table_name -> List[Event]
            - other visit-level info
    
            <Event>
                - code: str
                - other event-level info
    

+%% Cell type:code id:4cef4476-6bc8-4791-b246-4c329a80a2e4 tags:
+
+``` python
+table_names = mimic3base.get_all_tables()
+print(table_names)
+
+for t in table_names:
+    d = mimic3base.get_text_dict(t)
+    print(f"Table: {t}")
+    print(d['data'][:5])
+    print('\n\n')
+```
+
+%% Output
+
+    dict_keys(['D_ICD_DIAGNOSES', 'D_ITEMS', 'D_ICD_PROCEDURES', 'D_LABITEMS'])
+    Table: D_ICD_DIAGNOSES
+    [['0010', 'Cholera d/t vib cholerae', 'Cholera due to vibrio cholerae'], ['0011', 'Cholera d/t vib el tor', 'Cholera due to vibrio cholerae el tor'], ['0019', 'Cholera NOS', 'Cholera, unspecified'], ['0020', 'Typhoid fever', 'Typhoid fever'], ['0021', 'Paratyphoid fever a', 'Paratyphoid fever A']]
+    
+    
+    
+    Table: D_ITEMS
+    [['1126', 'Art.pH', 'ABG'], ['1127', 'WBC   (4-11,000)', 'Hematology'], ['1520', 'ACT', 'Coags'], ['1521', 'Albumin', 'Chemistry'], ['1522', 'Calcium', 'Chemistry']]
+    
+    
+    
+    Table: D_ICD_PROCEDURES
+    [['0001', 'Ther ult head & neck ves', 'Therapeutic ultrasound of vessels of head and neck'], ['0002', 'Ther ultrasound of heart', 'Therapeutic ultrasound of heart'], ['0003', 'Ther ult peripheral ves', 'Therapeutic ultrasound of peripheral vascular vessels'], ['0009', 'Other therapeutic ultsnd', 'Other therapeutic ultrasound'], ['0010', 'Implant chemothera agent', 'Implantation of chemotherapeutic agent']]
+    
+    
+    
+    Table: D_LABITEMS
+    [['50800', 'SPECIMEN TYPE', 'BLOOD', 'BLOOD GAS'], ['50801', 'Alveolar-arterial Gradient', 'Blood', 'Blood Gas'], ['50802', 'Base Excess', 'Blood', 'Blood Gas'], ['50803', 'Calculated Bicarbonate, Whole Blood', 'Blood', 'Blood Gas'], ['50804', 'Calculated Total CO2', 'Blood', 'Blood Gas']]
+    
+    
+    
+
 %% Cell type:markdown id:b02b8e28-23ac-4ac1-a9b6-ae546e28c905 tags:

 **Tasks**

 Declare tasks for 2 of the 5 prediction tasks specified in the paper. We will create dataloaders for each task that contain the ICD codes and the raw text for each (patient, visit).

 %% Cell type:code id:08692aad-db67-487b-9b26-dbde0ab6adce tags:

 ``` python
 # The original authors tackled 5 tasks
 #   1. readmission
 #   2. mortality
 #   3. an ICU stay exceeding three days
 #   4. an ICU stay exceeding seven days
 #   5. diagnosis prediction

 def readmission_pred_task(patient, time_window=15):
    """Build readmission-prediction samples for one patient.

    Each visit except the last yields at most one sample. The label is 1
    when the following visit starts fewer than `time_window` days after the
    current visit started, and 0 otherwise.

    Args:
        patient: a <pyhealth.data.Patient> object; it is indexed by visit
            position and `len(patient)` is its number of visits.
        time_window: readmission window in days. Defaults to 15.

    Returns:
        A list of sample dicts with keys "visit_id", "patient_id",
        "conditions", "procedures", "drugs" and "label".
    """
    samples = []

    # loop over all visits but the last one (the last visit has no follow-up)
    for i in range(len(patient) - 1):

        # visit and next_visit are both <pyhealth.data.Visit> objects
        visit = patient[i]
        next_visit = patient[i + 1]

        # step 1: define the readmission_label from the gap between admissions.
        # The previous version copied the mortality label here, so this task
        # produced mortality targets under a readmission name.
        time_diff = (next_visit.encounter_time - visit.encounter_time).days
        readmission_label = 1 if time_diff < time_window else 0

        # step 2: get code-based feature information
        conditions = visit.get_code_list(table="DIAGNOSES_ICD")
        procedures = visit.get_code_list(table="PROCEDURES_ICD")
        drugs = visit.get_code_list(table="PRESCRIPTIONS")

        # step 3: exclusion criteria: visits without condition, procedure, or drug
        if not (conditions and procedures and drugs):
            continue

        # step 4: assemble the samples
        samples.append(
            {
                "visit_id": visit.visit_id,
                "patient_id": patient.patient_id,
                # the following keys can be the "feature_keys" or "label_key" for initializing downstream ML model
                "conditions": conditions,
                "procedures": procedures,
                "drugs": drugs,
                "label": readmission_label,
            }
        )
    return samples

 def mortality_pred_task(patient):
    """Build mortality-prediction samples for one patient.

    Args:
        patient: a <pyhealth.data.Patient> object, indexed by visit position.

    Returns:
        A list of sample dicts, one per retained visit. The label is taken
        from the discharge status of the visit that follows it.
    """
    collected = []

    # The final visit has no successor to read a label from, so stop before it.
    for idx in range(len(patient) - 1):
        current = patient[idx]
        following = patient[idx + 1]

        # Label: the next visit's discharge status when it is 0 or 1, else 0.
        status = following.discharge_status
        label = int(status) if status in [0, 1] else 0

        # Code-based features recorded during the current visit.
        conditions = current.get_code_list(table="DIAGNOSES_ICD")
        procedures = current.get_code_list(table="PROCEDURES_ICD")
        drugs = current.get_code_list(table="PRESCRIPTIONS")

        # Drop visits missing any of the three feature lists.
        if not conditions or not procedures or not drugs:
            continue

        # The feature and label keys below are what downstream models select from.
        sample = {
            "visit_id": current.visit_id,
            "patient_id": patient.patient_id,
            "conditions": conditions,
            "procedures": procedures,
            "drugs": drugs,
            "label": label,
        }
        collected.append(sample)

    return collected


 # mimic3sample = mimic3base.set_task(task_fn=drug_recommendation_mimic3_fn) # use default task
 # train_ds, val_ds, test_ds = split_by_patient(mimic3sample, [0.8, 0.1, 0.1])
 ```

 %% Cell type:code id:96d047ff-6412-462c-9294-a5ba2d3bdba2 tags:

 ``` python
 # Build the readmission sample dataset with `readmission_pred_task` and report its stats.
 # NOTE(review): `dataset` is not assigned in the cells visible here (the dataset
 # constructed above is named `mimic3base`) — confirm which object is intended.
 re_dataset = dataset.set_task(readmission_pred_task)
 re_dataset.stat()
 ```

 %% Cell type:code id:70586b66-a814-4898-892b-d1bdd339ce7b tags:

 ``` python
 # Build the mortality sample dataset with `mortality_pred_task` and report its stats.
 # NOTE(review): `dataset` is not assigned in the cells visible here (the dataset
 # constructed above is named `mimic3base`) — confirm which object is intended.
 mor_dataset = dataset.set_task(mortality_pred_task)
 mor_dataset.stat()
 ```

 %% Cell type:code id:1056b60a-5861-4e47-b694-891053cc8470 tags:

 ``` python
 # create dataloaders (torch.data.DataLoader)
 # Only the training loader shuffles; validation and test keep their order.
 # NOTE(review): `train_ds`, `val_ds` and `test_ds` are only assigned in the
 # commented-out `split_by_patient` line of the tasks cell — confirm a split is
 # run before this cell.
 train_loader = get_dataloader(train_ds, batch_size=32, shuffle=True)
 val_loader = get_dataloader(val_ds, batch_size=32, shuffle=False)
 test_loader = get_dataloader(test_ds, batch_size=32, shuffle=False)
 ```

 %% Cell type:markdown id:0f9b59b9-4bdb-44c4-b155-0a2a2d0f8d07 tags:

 ### Embed MIMIC III Data using BERT

 %% Cell type:code id:1f5e0b1b-d75c-42ee-8d98-879f14b04606 tags:

 ``` python
 ```