Notebook: add pyHealth dataset parsing (incomplete)

Add a custom wrapper around the pyHealth MIMIC3Dataset class that adds functions to process the D_* text tables and extract the natural language we need for DescEmb. The processing functions are not complete. TODO - actually write these. Force MIMIC3Dataset to load from local files using the global `DATA_DIR_`. Added skeletons for 2 pyHealth tasks: readmission_pred_task and mortality_pred_task. These are from the tutorial and will be necessary to take the output of MIMIC3DatasetWrapper and turn it into a test/train dataset. TODO - incorporate the text data into the output of these tasks. TODO - hook up the BERT model to some fake pasted strings of what we expect the dataset to look like, to verify it runs as we expect.

Notebook: add pyHealth dataset parsing (incomplete)
be496e89 · aBotel · a4e89fe7 · be496e89
Commit be496e89 authored 2 years ago by aBotel
--- a/src/cs598_dlh_final_project.ipynb
+++ b/src/cs598_dlh_final_project.ipynb
 {
 "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3225717c-458b-45cb-9e8b-89ecd03b9d07",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
  {
   "cell_type": "markdown",
   "id": "7ccba6af-1867-491d-b9d0-d99965738c64",
@@ -18,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
   "id": "5da368cd-3045-4ec0-86fb-2ce15b5d1b92",
   "metadata": {},
   "outputs": [],
@@ -26,14 +18,18 @@
    "# General includes.\n",
    "import os\n",
    "\n",
+    "# Typing includes.\n",
+    "from typing import Dict, List, Optional\n",
+    "\n",
    "# Numerical includes.\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from torch import nn\n",
    "\n",
-    "# Pyhealth includes.\n",
-    "from pyhealth.datasets import MIMIC3Dataset"
+    "# pyHealth includes.\n",
+    "from pyhealth.datasets import MIMIC3Dataset\n",
+    "from pyhealth.data import Patient, Visit, Event"
   ]
  },
  {
@@ -57,12 +53,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 22,
   "id": "ebef3528-0396-459a-8112-b272089f5d67",
   "metadata": {},
   "outputs": [],
   "source": [
-    "USE_GPU_ = False"
+    "USE_GPU_ = False\n",
+    "GPU_STRING_ = 'cuda'\n",
+    "DATA_DIR_ = '../data_input_path/mimic'"
   ]
  },
  {
@@ -70,7 +68,7 @@
   "id": "df0f8271-44b2-44a2-b7cc-cd7339a70e87",
   "metadata": {},
   "source": [
-    "## Preprocessing"
+    "# Preprocessing"
   ]
  },
  {
@@ -176,9 +174,9 @@
    "\n",
    "# If you have a GPU, put everything on cuda\n",
    "if USE_GPU_:\n",
-    "    tokens_tensor = tokens_tensor.to('cuda')\n",
-    "    segments_tensors = segments_tensors.to('cuda')\n",
-    "    model.to('cuda')\n",
+    "    tokens_tensor = tokens_tensor.to(GPU_STRING_)\n",
+    "    segments_tensors = segments_tensors.to(GPU_STRING_)\n",
+    "    model.to(GPU_STRING_)\n",
    "\n",
    "# Predict all tokens\n",
    "with torch.no_grad():\n",
@@ -227,16 +225,302 @@
   "id": "8f0d5fa9-226e-4612-b28f-ed24de16399a",
   "metadata": {},
   "source": [
-    "### Load "
+    "### Load MIMIC III Data"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 29,
+   "id": "b881e548-4d27-4725-8c47-b2612157929e",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "IndentationError",
+     "evalue": "expected an indented block (845818540.py, line 69)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m  File \u001b[0;32m\"/var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_27199/845818540.py\"\u001b[0;36m, line \u001b[0;32m69\u001b[0m\n\u001b[0;31m    def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:\u001b[0m\n\u001b[0m      ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n"
+     ]
+    }
+   ],
+   "source": [
+    "class MIMIC3DatasetWrapper(MIMIC3Dataset):\n",
+    "    ''' Add extra tables to the MIMIC III dataset.\n",
+    "    \n",
+    "      Some of the tables we need like \"D_ICD_DIAGNOSES\", \"D_ITEMS\", \"D_ICD_PROCEDURES\"\n",
+    "      are not supported out of the box. \n",
+    "      \n",
+    "      This class defines parsing methods to extract text data from these extra tables.\n",
+    "      The text data is generally joined on the PATIENTID, HADMID, ITEMID to match the\n",
+    "      pyHealth Vists class representation.\n",
+    "    '''\n",
+    "    \n",
+    "    # Skip init and defer to base class.\n",
+    "    \n",
+    "    # Note the name has to match the table name exactly.\n",
+    "    # See https://github.com/sunlabuiuc/PyHealth/blob/master/pyhealth/datasets/mimic3.py#L71.\n",
+    "    def parse_d_icd_diagnoses(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n",
+    "        # TODO(botelho3) fill this in to join the text descriptions to the visit.\n",
+    "        return patients\n",
+    "#                 \"\"\"Helper function which parses DIAGNOSES_ICD table.\n",
+    "#         Will be called in `self.parse_tables()`\n",
+    "#         Docs:\n",
+    "#             - DIAGNOSES_ICD: https://mimic.mit.edu/docs/iii/tables/diagnoses_icd/\n",
+    "#         Args:\n",
+    "#             patients: a dict of `Patient` objects indexed by patient_id.\n",
+    "#         Returns:\n",
+    "#             The updated patients dict.\n",
+    "#         Note:\n",
+    "#             MIMIC-III does not provide specific timestamps in DIAGNOSES_ICD\n",
+    "#                 table, so we set it to None.\n",
+    "#         \"\"\"\n",
+    "#         table = \"DIAGNOSES_ICD\"\n",
+    "#         # read table\n",
+    "#         df = pd.read_csv(\n",
+    "#             os.path.join(self.root, f\"{table}.csv\"),\n",
+    "#             dtype={\"SUBJECT_ID\": str, \"HADM_ID\": str, \"ICD9_CODE\": str},\n",
+    "#         )\n",
+    "#         # drop records of the other patients\n",
+    "#         df = df[df[\"SUBJECT_ID\"].isin(patients.keys())]\n",
+    "#         # drop rows with missing values\n",
+    "#         df = df.dropna(subset=[\"SUBJECT_ID\", \"HADM_ID\", \"ICD9_CODE\"])\n",
+    "#         # sort by sequence number (i.e., priority)\n",
+    "#         df = df.sort_values([\"SUBJECT_ID\", \"HADM_ID\", \"SEQ_NUM\"], ascending=True)\n",
+    "#         # group by patient and visit\n",
+    "#         group_df = df.groupby(\"SUBJECT_ID\")\n",
+    "\n",
+    "#         # parallel unit of diagnosis (per patient)\n",
+    "#         def diagnosis_unit(p_id, p_info):\n",
+    "#             events = []\n",
+    "#             for v_id, v_info in p_info.groupby(\"HADM_ID\"):\n",
+    "#                 for code in v_info[\"ICD9_CODE\"]:\n",
+    "#                     event = Event(\n",
+    "#                         code=code,\n",
+    "#                         table=table,\n",
+    "#                         vocabulary=\"ICD9CM\",\n",
+    "#                         visit_id=v_id,\n",
+    "#                         patient_id=p_id,\n",
+    "#                     )\n",
+    "#                     events.append(event)\n",
+    "#             return events\n",
+    "\n",
+    "#         # parallel apply\n",
+    "#         group_df = group_df.parallel_apply(\n",
+    "#             lambda x: diagnosis_unit(x.SUBJECT_ID.unique()[0], x)\n",
+    "#         )\n",
+    "\n",
+    "#         # summarize the results\n",
+    "#         patients = self._add_events_to_patient_dict(patients, group_df)\n",
+    "#         return patients\n",
+    "    \n",
+    "    def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n",
+    "        # TODO(botelho3) fill this in to join the text descriptions to the visit.\n",
+    "        return patients\n",
+    "    \n",
+    "    def parse_d_icd_procedures(self, patients: Dict[str, Patient]) -> Dict[str, Patient]: \n",
+    "        # TODO(botelho3) fill this in to join the text descriptions to the visit.\n",
+    "        return patients\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
   "id": "427a42a6-441e-438c-96f4-b38ba82fd192",
   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Parsing PATIENTS and ADMISSIONS: 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 540.68it/s]\n",
+      "Parsing PROCEDURES_ICD: 100%|███████████████████████████████████████████████████████| 113/113 [00:00<00:00, 5934.54it/s]\n",
+      "Parsing PRESCRIPTIONS: 100%|██████████████████████████████████████████████████████████| 122/122 [00:01<00:00, 99.08it/s]\n",
+      "Parsing LABEVENTS: 100%|██████████████████████████████████████████████████████████████| 129/129 [00:06<00:00, 21.36it/s]\n",
+      "Mapping codes: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 214.50it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Statistics of base dataset (dev=False):\n",
+      "\t- Dataset: MIMIC3DatasetWrapper\n",
+      "\t- Number of patients: 100\n",
+      "\t- Number of visits: 129\n",
+      "\t- Number of visits per patient: 1.2900\n",
+      "\t- Number of events per visit in D_ITEMS: 0.0000\n",
+      "\t- Number of events per visit in D_ICD_PROCEDURES: 0.0000\n",
+      "\t- Number of events per visit in PROCEDURES_ICD: 3.9225\n",
+      "\t- Number of events per visit in PRESCRIPTIONS: 115.6667\n",
+      "\t- Number of events per visit in LABEVENTS: 479.1628\n",
+      "\n",
+      "\n",
+      "dataset.patients: patient_id -> <Patient>\n",
+      "\n",
+      "<Patient>\n",
+      "    - visits: visit_id -> <Visit> \n",
+      "    - other patient-level info\n",
+      "    \n",
+      "    <Visit>\n",
+      "        - event_list_dict: table_name -> List[Event]\n",
+      "        - other visit-level info\n",
+      "    \n",
+      "        <Event>\n",
+      "            - code: str\n",
+      "            - other event-level info\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "mimic3base = MIMIC3DatasetWrapper(\n",
+    "    #root=\"https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III/\",\n",
+    "    root=os.path.join(os.getcwd(), DATA_DIR_),\n",
+    "    tables=[\"D_ITEMS\", \"D_ICD_PROCEDURES\", \"PROCEDURES_ICD\", \"PRESCRIPTIONS\", \"LABEVENTS\",], # \"D_ICD_DIAGNOSES\", \"DIAGNOSES_ICD\"\n",
+    "    # map all NDC codes to ATC 3-rd level codes in these tables\n",
+    "    # See https://en.wikipedia.org/wiki/Anatomical_Therapeutic_Chemical_Classification_System.\n",
+    "    code_mapping={\"NDC\": (\"ATC\", {\"target_kwargs\": {\"level\": 3}})},\n",
+    ")\n",
+    "\n",
+    "mimic3base.stat()\n",
+    "mimic3base.info()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b02b8e28-23ac-4ac1-a9b6-ae546e28c905",
+   "metadata": {},
+   "source": [
+    "**Tasks**\n",
+    "\n",
+    "Declare tasks for 2 of the 5 prediction tasks specified in the paper. We will create dataloaders for each task that contain the ICD codes and the raw text for each (patient, visit)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08692aad-db67-487b-9b26-dbde0ab6adce",
+   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "# The original authors tackled 5 tasks\n",
+    "#   1. readmission\n",
+    "#   2. mortality\n",
+    "#   3. an ICU stay exceeding three days\n",
+    "#   4. an ICU stay exceeding seven days\n",
+    "#   5. diagnosis prediction\n",
+    "\n",
+    "def readmission_pred_task(patient):\n",
+    "    \"\"\"\n",
+    "    patient is a <pyhealth.data.Patient> object\n",
+    "    \"\"\"\n",
+    "    samples = []\n",
+    "\n",
+    "    # loop over all visits but the last one\n",
+    "    for i in range(len(patient) - 1):\n",
+    "\n",
+    "        # visit and next_visit are both <pyhealth.data.Visit> objects\n",
+    "        visit = patient[i]\n",
+    "        next_visit = patient[i + 1]\n",
+    "\n",
+    "        # step 1: define the mortality_label\n",
+    "        if next_visit.discharge_status not in [0, 1]:\n",
+    "            mortality_label = 0\n",
+    "        else:\n",
+    "            mortality_label = int(next_visit.discharge_status)\n",
+    "\n",
+    "        # step 2: get code-based feature information\n",
+    "        conditions = visit.get_code_list(table=\"DIAGNOSES_ICD\")\n",
+    "        procedures = visit.get_code_list(table=\"PROCEDURES_ICD\")\n",
+    "        drugs = visit.get_code_list(table=\"PRESCRIPTIONS\")\n",
+    "        labevents = visit.get_code_list(table=\"LABEVENTS\")\n",
+    "\n",
+    "        # step 3: exclusion criteria: visits without condition, procedure, or drug\n",
+    "        if len(conditions) * len(procedures) * len(drugs) == 0: continue\n",
+    "        \n",
+    "        # step 4: assemble the samples\n",
+    "        samples.append(\n",
+    "            {\n",
+    "                \"visit_id\": visit.visit_id,\n",
+    "                \"patient_id\": patient.patient_id,\n",
+    "                # the following keys can be the \"feature_keys\" or \"label_key\" for initializing downstream ML model\n",
+    "                \"conditions\": conditions,\n",
+    "                \"procedures\": procedures,\n",
+    "                \"drugs\": drugs,\n",
+    "                \"label\": mortality_label,\n",
+    "            }\n",
+    "        )\n",
+    "    return samples\n",
+    "\n",
+    "def mortality_pred_task(patient):\n",
+    "    \"\"\"\n",
+    "    patient is a <pyhealth.data.Patient> object\n",
+    "    \"\"\"\n",
+    "    samples = []\n",
+    "\n",
+    "    # loop over all visits but the last one\n",
+    "    for i in range(len(patient) - 1):\n",
+    "\n",
+    "        # visit and next_visit are both <pyhealth.data.Visit> objects\n",
+    "        visit = patient[i]\n",
+    "        next_visit = patient[i + 1]\n",
+    "\n",
+    "        # step 1: define the mortality_label\n",
+    "        if next_visit.discharge_status not in [0, 1]:\n",
+    "            mortality_label = 0\n",
+    "        else:\n",
+    "            mortality_label = int(next_visit.discharge_status)\n",
+    "\n",
+    "        # step 2: get code-based feature information\n",
+    "        conditions = visit.get_code_list(table=\"DIAGNOSES_ICD\")\n",
+    "        procedures = visit.get_code_list(table=\"PROCEDURES_ICD\")\n",
+    "        drugs = visit.get_code_list(table=\"PRESCRIPTIONS\")\n",
+    "\n",
+    "        # step 3: exclusion criteria: visits without condition, procedure, or drug\n",
+    "        if len(conditions) * len(procedures) * len(drugs) == 0: continue\n",
+    "        \n",
+    "        # step 4: assemble the samples\n",
+    "        samples.append(\n",
+    "            {\n",
+    "                \"visit_id\": visit.visit_id,\n",
+    "                \"patient_id\": patient.patient_id,\n",
+    "                # the following keys can be the \"feature_keys\" or \"label_key\" for initializing downstream ML model\n",
+    "                \"conditions\": conditions,\n",
+    "                \"procedures\": procedures,\n",
+    "                \"drugs\": drugs,\n",
+    "                \"label\": mortality_label,\n",
+    "            }\n",
+    "        )\n",
+    "    return samples\n",
+    "\n",
+    "\n",
+    "# mimic3sample = mimic3base.set_task(task_fn=drug_recommendation_mimic3_fn) # use default task\n",
+    "# train_ds, val_ds, test_ds = split_by_patient(mimic3sample, [0.8, 0.1, 0.1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "96d047ff-6412-462c-9294-a5ba2d3bdba2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "re_dataset = dataset.set_task(readmission_pred_task)\n",
+    "re_dataset.stat()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70586b66-a814-4898-892b-d1bdd339ce7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mor_dataset = dataset.set_task(mortality_pred_task)\n",
+    "mor_dataset.stat()"
+   ]
  },
  {
   "cell_type": "code",
@@ -244,6 +528,27 @@
   "id": "1056b60a-5861-4e47-b694-891053cc8470",
   "metadata": {},
   "outputs": [],
+   "source": [
+    "# create dataloaders (torch.data.DataLoader)\n",
+    "train_loader = get_dataloader(train_ds, batch_size=32, shuffle=True)\n",
+    "val_loader = get_dataloader(val_ds, batch_size=32, shuffle=False)\n",
+    "test_loader = get_dataloader(test_ds, batch_size=32, shuffle=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f9b59b9-4bdb-44c4-b155-0a2a2d0f8d07",
+   "metadata": {},
+   "source": [
+    "### Embed MIMIC III Data using BERT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f5e0b1b-d75c-42ee-8d98-879f14b04606",
+   "metadata": {},
+   "outputs": [],
   "source": []
  }
 ],

-%% Cell type:code id:3225717c-458b-45cb-9e8b-89ecd03b9d07 tags:
-
-``` python
-```
-
 %% Cell type:markdown id:7ccba6af-1867-491d-b9d0-d99965738c64 tags:

 # Imports

 %% Cell type:code id:5da368cd-3045-4ec0-86fb-2ce15b5d1b92 tags:

 ``` python
 # General includes.
 import os

+# Typing includes.
+from typing import Dict, List, Optional
+
 # Numerical includes.
 import numpy as np
 import pandas as pd
 import torch
 from torch import nn

-# Pyhealth includes.
+# pyHealth includes.
 from pyhealth.datasets import MIMIC3Dataset
+from pyhealth.data import Patient, Visit, Event
 ```

 %% Cell type:code id:95e714b1-ca5d-4d7b-9ace-6b4372adfd9f tags:

 ``` python
 # Model imports
 from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
 ```

 %% Cell type:markdown id:cf83fa50-9dd0-467b-bd36-634795e4a09c tags:

 # Globals

 %% Cell type:code id:ebef3528-0396-459a-8112-b272089f5d67 tags:

 ``` python
 USE_GPU_ = False
+GPU_STRING_ = 'cuda'
+DATA_DIR_ = '../data_input_path/mimic'
 ```

 %% Cell type:markdown id:df0f8271-44b2-44a2-b7cc-cd7339a70e87 tags:

-## Preprocessing
+# Preprocessing

 %% Cell type:code id:02624f53-6cee-4db0-8b15-e08b755a04f2 tags:

 ``` python
 ```

 %% Cell type:markdown id:ebe2c2d9-c161-4b9d-8182-86384faea929 tags:

 ### Load BERT Model

 %% Cell type:markdown id:39b0edec-1570-4640-9926-d908b456f957 tags:

 See instructions here:
 - https://pypi.org/project/pytorch-pretrained-bert/#examples
 - https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
 - Could also get it from pytorch transformers library: https://pytorch.org/hub/huggingface_pytorch-transformers/

 %% Cell type:code id:f40665f4-db92-4567-b0f9-001bd35e9284 tags:

 ``` python

 # Load pre-trained model tokenizer (vocabulary)
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
 
 # TODO(botelho3): `bert-base-uncased` is big. Load `bert-tiny` from the filesystem instead?
 # Model available at https://huggingface.co/prajjwal1/bert-tiny.
 # model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None)
 
 # Tokenize a two-sentence input; the special tokens are written out by hand.
 text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
 tokenized_text = tokenizer.tokenize(text)
 
 # Mask a token that we will try to predict back with `BertForMaskedLM`
 masked_index = 8  # position of the second 'henson' in the token list asserted below
 tokenized_text[masked_index] = '[MASK]'
 assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
 
 # Convert tokens to vocabulary indices
 indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
 # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
 segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]  # 7 tokens up to and including the first [SEP], then 7 more
 
 # Convert inputs to PyTorch tensors (wrapped in a list to add a leading batch dimension of 1)
 tokens_tensor = torch.tensor([indexed_tokens])
 segments_tensors = torch.tensor([segments_ids])
 ```

 %% Output

    100%|█████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 877100.29B/s]

 %% Cell type:code id:a7539b72-a871-4d52-947b-075b5c4be2ae tags:

 ``` python
 ```

 %% Cell type:code id:de132943-2c3e-405b-b2f8-5d348472672c tags:

 ``` python
 # Load pre-trained model (weights)
 model = BertForMaskedLM.from_pretrained('bert-base-uncased')
 model.eval()

 # If you have a GPU, put everything on cuda
 if USE_GPU_:
-    tokens_tensor = tokens_tensor.to('cuda')
-    segments_tensors = segments_tensors.to('cuda')
-    model.to('cuda')
+    tokens_tensor = tokens_tensor.to(GPU_STRING_)
+    segments_tensors = segments_tensors.to(GPU_STRING_)
+    model.to(GPU_STRING_)

 # Predict all tokens.
 # The model call returns a tuple (the recorded TypeError shows that indexing
 # it with [0, masked_index] fails), so take the first element, which holds
 # the per-token prediction scores, before indexing into it.
 # The segment ids are passed by keyword so the call does not rely on the
 # positional order of the optional arguments.
 with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]
 
 # confirm we were able to predict 'henson'
 predicted_index = torch.argmax(predictions[0, masked_index]).item()
 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
 assert predicted_token == 'henson'
 ```

 %% Output

    ---------------------------------------------------------------------------
    TypeError                                 Traceback (most recent call last)
    /var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_23031/2203343201.py in <module>
         14
         15 # confirm we were able to predict 'henson'
    ---> 16 predicted_index = torch.argmax(predictions[0, masked_index]).item()
         17 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
         18 assert predicted_token == 'henson'
    TypeError: tuple indices must be integers or slices, not tuple

 %% Cell type:code id:02efb18a-626d-42d4-b19d-ba4b8f752a00 tags:

 ``` python
 ```

 %% Cell type:code id:0e2d58de-bf9a-4b57-977f-6aa3ac75e84f tags:

 ``` python
 # https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
 class BERTClassification(nn.Module):
    """BERT encoder followed by dropout and a single-logit linear head.

    Args:
        pretrained_name: name or path handed to `BertModel.from_pretrained`.
            Defaults to 'bert-base-cased'.
            NOTE(review): the tokenizer cell loads 'bert-base-uncased' --
            confirm which vocabulary this classifier should be paired with.
        dropout: dropout probability applied to the pooled output.
    """

    def __init__(self, pretrained_name='bert-base-cased', dropout=0.4):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.bert_drop = nn.Dropout(dropout)
        # Size the head from the loaded model's config instead of hard-coding
        # 768, so smaller or larger checkpoints work without edits.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one logit per input sequence.

        Args:
            ids: token id tensor.
            mask: attention mask tensor.
            token_type_ids: segment id tensor.
        """
        encoder_outputs = self.bert(ids, attention_mask=mask,
                                    token_type_ids=token_type_ids)
        # Index instead of unpacking: the pooled output is the second element,
        # and the tuple may carry extra entries depending on the configuration.
        pooled_out = encoder_outputs[1]
        return self.out(self.bert_drop(pooled_out))
 ```

 %% Cell type:markdown id:8f0d5fa9-226e-4612-b28f-ed24de16399a tags:

-### Load
+### Load MIMIC III Data
+
+%% Cell type:code id:b881e548-4d27-4725-8c47-b2612157929e tags:
+
+``` python
+class MIMIC3DatasetWrapper(MIMIC3Dataset):
+    ''' Add extra tables to the MIMIC III dataset.
+
+      Some of the tables we need like "D_ICD_DIAGNOSES", "D_ITEMS", "D_ICD_PROCEDURES"
+      are not supported out of the box.
+
+      This class defines parsing methods to extract text data from these extra tables.
+      The text data is generally joined on the PATIENTID, HADMID, ITEMID to match the
+      pyHealth Vists class representation.
+    '''
+
+    # Skip init and defer to base class.
+
+    # Note the name has to match the table name exactly.
+    # See https://github.com/sunlabuiuc/PyHealth/blob/master/pyhealth/datasets/mimic3.py#L71.
+    def parse_d_icd_diagnoses(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
+        # TODO(botelho3) fill this in to join the text descriptions to the visit.
+        return patients
+#                 """Helper function which parses DIAGNOSES_ICD table.
+#         Will be called in `self.parse_tables()`
+#         Docs:
+#             - DIAGNOSES_ICD: https://mimic.mit.edu/docs/iii/tables/diagnoses_icd/
+#         Args:
+#             patients: a dict of `Patient` objects indexed by patient_id.
+#         Returns:
+#             The updated patients dict.
+#         Note:
+#             MIMIC-III does not provide specific timestamps in DIAGNOSES_ICD
+#                 table, so we set it to None.
+#         """
+#         table = "DIAGNOSES_ICD"
+#         # read table
+#         df = pd.read_csv(
+#             os.path.join(self.root, f"{table}.csv"),
+#             dtype={"SUBJECT_ID": str, "HADM_ID": str, "ICD9_CODE": str},
+#         )
+#         # drop records of the other patients
+#         df = df[df["SUBJECT_ID"].isin(patients.keys())]
+#         # drop rows with missing values
+#         df = df.dropna(subset=["SUBJECT_ID", "HADM_ID", "ICD9_CODE"])
+#         # sort by sequence number (i.e., priority)
+#         df = df.sort_values(["SUBJECT_ID", "HADM_ID", "SEQ_NUM"], ascending=True)
+#         # group by patient and visit
+#         group_df = df.groupby("SUBJECT_ID")
+
+#         # parallel unit of diagnosis (per patient)
+#         def diagnosis_unit(p_id, p_info):
+#             events = []
+#             for v_id, v_info in p_info.groupby("HADM_ID"):
+#                 for code in v_info["ICD9_CODE"]:
+#                     event = Event(
+#                         code=code,
+#                         table=table,
+#                         vocabulary="ICD9CM",
+#                         visit_id=v_id,
+#                         patient_id=p_id,
+#                     )
+#                     events.append(event)
+#             return events
+
+#         # parallel apply
+#         group_df = group_df.parallel_apply(
+#             lambda x: diagnosis_unit(x.SUBJECT_ID.unique()[0], x)
+#         )
+
+#         # summarize the results
+#         patients = self._add_events_to_patient_dict(patients, group_df)
+#         return patients
+
+    def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
+        # TODO(botelho3) fill this in to join the text descriptions to the visit.
+        return patients
+
+    def parse_d_icd_procedures(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
+        # TODO(botelho3) fill this in to join the text descriptions to the visit.
+        return patients
+
+```
+
+%% Output
+
+      File "/var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_27199/845818540.py", line 69
+        def parse_d_items(self, patients: Dict[str, Patient]) -> Dict[str, Patient]:
+          ^
+    IndentationError: expected an indented block

 %% Cell type:code id:427a42a6-441e-438c-96f4-b38ba82fd192 tags:

 ``` python
+mimic3base = MIMIC3DatasetWrapper(
+    # Remote alternative (disabled): root="https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III/",
+    root=os.path.join(os.getcwd(), DATA_DIR_),  # local copy; DATA_DIR_ is joined onto the current working directory
+    tables=["D_ITEMS", "D_ICD_PROCEDURES", "PROCEDURES_ICD", "PRESCRIPTIONS", "LABEVENTS",], # NOTE: "D_ICD_DIAGNOSES" and "DIAGNOSES_ICD" are currently left out
+    # Map all NDC codes to ATC level-3 codes in these tables.
+    # See https://en.wikipedia.org/wiki/Anatomical_Therapeutic_Chemical_Classification_System.
+    code_mapping={"NDC": ("ATC", {"target_kwargs": {"level": 3}})},
+)
+
+mimic3base.stat()  # print dataset statistics (patients, visits, events per table)
+mimic3base.info()  # print the Patient -> Visit -> Event structure overview
+```
+
+%% Output
+
+    Parsing PATIENTS and ADMISSIONS: 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 540.68it/s]
+    Parsing PROCEDURES_ICD: 100%|███████████████████████████████████████████████████████| 113/113 [00:00<00:00, 5934.54it/s]
+    Parsing PRESCRIPTIONS: 100%|██████████████████████████████████████████████████████████| 122/122 [00:01<00:00, 99.08it/s]
+    Parsing LABEVENTS: 100%|██████████████████████████████████████████████████████████████| 129/129 [00:06<00:00, 21.36it/s]
+    Mapping codes: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 214.50it/s]
+
+    
+    Statistics of base dataset (dev=False):
+    	- Dataset: MIMIC3DatasetWrapper
+    	- Number of patients: 100
+    	- Number of visits: 129
+    	- Number of visits per patient: 1.2900
+    	- Number of events per visit in D_ITEMS: 0.0000
+    	- Number of events per visit in D_ICD_PROCEDURES: 0.0000
+    	- Number of events per visit in PROCEDURES_ICD: 3.9225
+    	- Number of events per visit in PRESCRIPTIONS: 115.6667
+    	- Number of events per visit in LABEVENTS: 479.1628
+    
+    
+    dataset.patients: patient_id -> <Patient>
+    
+    <Patient>
+        - visits: visit_id -> <Visit>
+        - other patient-level info
+    
+        <Visit>
+            - event_list_dict: table_name -> List[Event]
+            - other visit-level info
+    
+            <Event>
+                - code: str
+                - other event-level info
+    
+
+%% Cell type:markdown id:b02b8e28-23ac-4ac1-a9b6-ae546e28c905 tags:
+
+**Tasks**
+
+Declare tasks for 2 of the 5 prediction tasks specified in the paper. We will create dataloaders for each task that contain the ICD codes and the raw text for each (patient, visit).
+
+%% Cell type:code id:08692aad-db67-487b-9b26-dbde0ab6adce tags:
+
+``` python
+# The original authors tackled 5 tasks
+#   1. readmission
+#   2. mortality
+#   3. an ICU stay exceeding three days
+#   4. an ICU stay exceeding seven days
+#   5. diagnosis prediction
+
+def readmission_pred_task(patient):
+    """
+    patient is a <pyhealth.data.Patient> object
+    """
+    samples = []
+
+    # loop over all visits but the last one
+    for i in range(len(patient) - 1):
+
+        # visit and next_visit are both <pyhealth.data.Visit> objects
+        visit = patient[i]
+        next_visit = patient[i + 1]
+
+        # step 1: define the mortality_label
+        if next_visit.discharge_status not in [0, 1]:
+            mortality_label = 0
+        else:
+            mortality_label = int(next_visit.discharge_status)
+
+        # step 2: get code-based feature information
+        conditions = visit.get_code_list(table="DIAGNOSES_ICD")
+        procedures = visit.get_code_list(table="PROCEDURES_ICD")
+        drugs = visit.get_code_list(table="PRESCRIPTIONS")
+        labevents = visit.get_code_list(table="LABEVENTS")
+
+        # step 3: exclusion criteria: visits without condition, procedure, or drug
+        if len(conditions) * len(procedures) * len(drugs) == 0: continue
+
+        # step 4: assemble the samples
+        samples.append(
+            {
+                "visit_id": visit.visit_id,
+                "patient_id": patient.patient_id,
+                # the following keys can be the "feature_keys" or "label_key" for initializing downstream ML model
+                "conditions": conditions,
+                "procedures": procedures,
+                "drugs": drugs,
+                "label": mortality_label,
+            }
+        )
+    return samples
+
+def mortality_pred_task(patient):
+    """
+    patient is a <pyhealth.data.Patient> object
+    """
+    samples = []
+
+    # loop over all visits but the last one
+    for i in range(len(patient) - 1):
+
+        # visit and next_visit are both <pyhealth.data.Visit> objects
+        visit = patient[i]
+        next_visit = patient[i + 1]
+
+        # step 1: define the mortality_label
+        if next_visit.discharge_status not in [0, 1]:
+            mortality_label = 0
+        else:
+            mortality_label = int(next_visit.discharge_status)
+
+        # step 2: get code-based feature information
+        conditions = visit.get_code_list(table="DIAGNOSES_ICD")
+        procedures = visit.get_code_list(table="PROCEDURES_ICD")
+        drugs = visit.get_code_list(table="PRESCRIPTIONS")
+
+        # step 3: exclusion criteria: visits without condition, procedure, or drug
+        if len(conditions) * len(procedures) * len(drugs) == 0: continue
+
+        # step 4: assemble the samples
+        samples.append(
+            {
+                "visit_id": visit.visit_id,
+                "patient_id": patient.patient_id,
+                # the following keys can be the "feature_keys" or "label_key" for initializing downstream ML model
+                "conditions": conditions,
+                "procedures": procedures,
+                "drugs": drugs,
+                "label": mortality_label,
+            }
+        )
+    return samples
+
+
+# mimic3sample = mimic3base.set_task(task_fn=drug_recommendation_mimic3_fn) # use default task
+# train_ds, val_ds, test_ds = split_by_patient(mimic3sample, [0.8, 0.1, 0.1])
+```
+
+%% Cell type:code id:96d047ff-6412-462c-9294-a5ba2d3bdba2 tags:
+
+``` python
+re_dataset = dataset.set_task(readmission_pred_task)
+re_dataset.stat()
+```
+
+%% Cell type:code id:70586b66-a814-4898-892b-d1bdd339ce7b tags:
+
+``` python
+mor_dataset = dataset.set_task(mortality_pred_task)
+mor_dataset.stat()
 ```

 %% Cell type:code id:1056b60a-5861-4e47-b694-891053cc8470 tags:

 ``` python
+# create dataloaders (torch.data.DataLoader)
+train_loader = get_dataloader(train_ds, batch_size=32, shuffle=True)
+val_loader = get_dataloader(val_ds, batch_size=32, shuffle=False)
+test_loader = get_dataloader(test_ds, batch_size=32, shuffle=False)
+```
+
+%% Cell type:markdown id:0f9b59b9-4bdb-44c4-b155-0a2a2d0f8d07 tags:
+
+### Embed MIMIC III Data using BERT
+
+%% Cell type:code id:1f5e0b1b-d75c-42ee-8d98-879f14b04606 tags:
+
+``` python
 ```