From fe1bb1d7b1966f199d23c03541d6ad9436966a7f Mon Sep 17 00:00:00 2001
From: aBotel <admbotel@uwaterloo.ca>
Date: Tue, 11 Apr 2023 02:46:56 -0700
Subject: [PATCH] Add install script for Py deps. Add skeleton .ipynb that loads BERT.

---
 README.md                         |   2 +
 install.sh                        |  11 ++
 src/cs598_dlh_final_project.ipynb | 271 ++++++++++++++++++++++++++++++
 3 files changed, 284 insertions(+)
 create mode 100755 install.sh
 create mode 100644 src/cs598_dlh_final_project.ipynb

diff --git a/README.md b/README.md
index 07d6ef0..f8927c9 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ To make it easy for you to get started with GitLab, here's a list of recommended
 
 Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)!
 
+Colab info: https://answers.illinois.edu/illinois/122558
+
 ## Add your files
 
 - [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..e154b15
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Uncomment to activate your conda env before installing.
+# conda activate <your_conda_env>
+
+# Optional: make the BERT model available in PyTorch via the legacy package.
+# conda config --add channels conda-forge
+# conda install --yes -c conda-forge pytorch-pretrained-bert
+
+pip install pytorch-transformers
+pip install pyhealth
diff --git a/src/cs598_dlh_final_project.ipynb b/src/cs598_dlh_final_project.ipynb
new file mode 100644
index 0000000..c4d5a68
--- /dev/null
+++ b/src/cs598_dlh_final_project.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3225717c-458b-45cb-9e8b-89ecd03b9d07",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ccba6af-1867-491d-b9d0-d99965738c64",
+   "metadata": {},
+   "source": [
+    "# Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "5da368cd-3045-4ec0-86fb-2ce15b5d1b92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# General includes.\n",
+    "import os\n",
+    "\n",
+    "# Numerical includes.\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "\n",
+    "# PyHealth includes.\n",
+    "from pyhealth.datasets import MIMIC3Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "95e714b1-ca5d-4d7b-9ace-6b4372adfd9f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model imports.\n",
+    "from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cf83fa50-9dd0-467b-bd36-634795e4a09c",
+   "metadata": {},
+   "source": [
+    "# Globals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ebef3528-0396-459a-8112-b272089f5d67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set to True to move the model and tensors to CUDA.\n",
+    "USE_GPU_ = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df0f8271-44b2-44a2-b7cc-cd7339a70e87",
+   "metadata": {},
+   "source": [
+    "## Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02624f53-6cee-4db0-8b15-e08b755a04f2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebe2c2d9-c161-4b9d-8182-86384faea929",
+   "metadata": {},
+   "source": [
+    "### Load BERT Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39b0edec-1570-4640-9926-d908b456f957",
+   "metadata": {},
+ "source": [ + "See instructions here: \n", + "- https://pypi.org/project/pytorch-pretrained-bert/#examples\n", + "- https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial\n", + "- Could also get it from pytorch transformers library: https://pytorch.org/hub/huggingface_pytorch-transformers/" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f40665f4-db92-4567-b0f9-001bd35e9284", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 877100.29B/s]\n" + ] + } + ], + "source": [ + "\n", + "# Load pre-trained model tokenizer (vocabulary)\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "\n", + "\n", + "# TODO(botelho3)`bert-base-uncased` is big. Load `bert-tiny` instead from the filesystem?\n", + "# Model available at https://huggingface.co/prajjwal1/bert-tiny.\n", + "# model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None)\n", + "\n", + "# Tokenized input\n", + "text = \"[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]\"\n", + "tokenized_text = tokenizer.tokenize(text)\n", + "\n", + "# Mask a token that we will try to predict back with `BertForMaskedLM`\n", + "masked_index = 8\n", + "tokenized_text[masked_index] = '[MASK]'\n", + "assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']\n", + "\n", + "# Convert token to vocabulary indices\n", + "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n", + "# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)\n", + "segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]\n", + "\n", + "# Convert inputs to PyTorch tensors\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "segments_tensors = torch.tensor([segments_ids])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7539b72-a871-4d52-947b-075b5c4be2ae", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "de132943-2c3e-405b-b2f8-5d348472672c", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "tuple indices must be integers or slices, not tuple", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_23031/2203343201.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;31m# confirm we were able to predict 'henson'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mpredicted_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpredictions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmasked_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0mpredicted_token\u001b[0m \u001b[0;34m=\u001b[0m 
+   "source": [
+    "# Load the pre-trained model (weights).\n",
+    "model = BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
+    "model.eval()\n",
+    "\n",
+    "# If you have a GPU, put everything on CUDA.\n",
+    "if USE_GPU_:\n",
+    "    tokens_tensor = tokens_tensor.to('cuda')\n",
+    "    segments_tensors = segments_tensors.to('cuda')\n",
+    "    model.to('cuda')\n",
+    "\n",
+    "# Predict all tokens. Unlike pytorch-pretrained-bert, pytorch-transformers\n",
+    "# models return a tuple; the first element holds the masked-LM scores.\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(tokens_tensor, token_type_ids=segments_tensors)\n",
+    "predictions = outputs[0]\n",
+    "\n",
+    "# Confirm we are able to predict 'henson'.\n",
+    "predicted_index = torch.argmax(predictions[0, masked_index]).item()\n",
+    "predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n",
+    "assert predicted_token == 'henson'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02efb18a-626d-42d4-b19d-ba4b8f752a00",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e2d58de-bf9a-4b57-977f-6aa3ac75e84f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Binary classification head on top of BERT, adapted from\n",
+    "# https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial\n",
+    "class BERTClassification(nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(BERTClassification, self).__init__()\n",
+    "        self.bert = BertModel.from_pretrained('bert-base-cased')\n",
+    "        self.bert_drop = nn.Dropout(0.4)\n",
+    "        self.out = nn.Linear(768, 1)\n",
+    "\n",
+    "    def forward(self, ids, mask, token_type_ids):\n",
+    "        # BertModel returns (sequence_output, pooled_output); keep the\n",
+    "        # pooled [CLS] representation for classification.\n",
+    "        _, pooled_out = self.bert(ids, attention_mask=mask,\n",
+    "                                  token_type_ids=token_type_ids)\n",
+    "        bert_out = self.bert_drop(pooled_out)\n",
+    "        output = self.out(bert_out)\n",
+    "\n",
+    "        return output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f0d5fa9-226e-4612-b28f-ed24de16399a",
+   "metadata": {},
+   "source": [
+    "### Load Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "427a42a6-441e-438c-96f4-b38ba82fd192",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1056b60a-5861-4e47-b694-891053cc8470",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab
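
Usage note: once the notebook's `BERTClassification` class is defined, its forward pass can be smoke-tested end to end before wiring it to MIMIC-III data. Below is a minimal sketch, assuming the class from the notebook is in scope and the `bert-base-cased` weights download successfully; the sentence, `clf`, and the final sigmoid are illustrative, not part of this patch.

```python
import torch
from pytorch_transformers import BertTokenizer

# Tokenize one sentence; 'bert-base-cased' matches the encoder used
# inside BERTClassification.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokens = tokenizer.tokenize("[CLS] Who was Jim Henson ? [SEP]")
ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
mask = torch.ones_like(ids)             # no padding, so attend to every token
token_type_ids = torch.zeros_like(ids)  # single sentence -> all segment 0

clf = BERTClassification()  # the class defined in the notebook
clf.eval()
with torch.no_grad():
    logits = clf(ids, mask=mask, token_type_ids=token_type_ids)
prob = torch.sigmoid(logits)  # nn.Linear(768, 1) head -> one binary probability
print(prob.shape)             # torch.Size([1, 1])
```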