Add install script for Py deps. Add skeleton .ipynb that loads BERT.

fe1bb1d7 · aBotel · 0e7f1962 · fe1bb1d7 · fe1bb1d7 · fe1bb1d7
Commit fe1bb1d7 authored 2 years ago by aBotel
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ To make it easy for you to get started with GitLab, here's a list of recommended

 Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)!

+Colab info: https://answers.illinois.edu/illinois/122558
+
 ## Add your files

 - [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files

--- a/install.sh
+++ b/install.sh
+#!/bin/bash
+
+# Maybe uncomment this to activate your conda env.
+# conda activate <your_conda_env> 
+
+# Make sure the BERT model is available in PyTorch
+#conda config --add channels conda-forge
+#conda install --yes -c conda-forge pytorch-pretrained-bert
+
+pip install pytorch-transformers
+pip install pyhealth
--- a/src/cs598_dlh_final_project.ipynb
+++ b/src/cs598_dlh_final_project.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3225717c-458b-45cb-9e8b-89ecd03b9d07",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ccba6af-1867-491d-b9d0-d99965738c64",
+   "metadata": {},
+   "source": [
+    "# Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "5da368cd-3045-4ec0-86fb-2ce15b5d1b92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# General includes.\n",
+    "import os\n",
+    "\n",
+    "# Numerical includes.\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "\n",
+    "# Pyhealth includes.\n",
+    "from pyhealth.datasets import MIMIC3Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "95e714b1-ca5d-4d7b-9ace-6b4372adfd9f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model imports \n",
+    "from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cf83fa50-9dd0-467b-bd36-634795e4a09c",
+   "metadata": {},
+   "source": [
+    "# Globals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ebef3528-0396-459a-8112-b272089f5d67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "USE_GPU_ = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df0f8271-44b2-44a2-b7cc-cd7339a70e87",
+   "metadata": {},
+   "source": [
+    "## Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02624f53-6cee-4db0-8b15-e08b755a04f2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebe2c2d9-c161-4b9d-8182-86384faea929",
+   "metadata": {},
+   "source": [
+    "### Load BERT Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39b0edec-1570-4640-9926-d908b456f957",
+   "metadata": {},
+   "source": [
+    "See instructions here: \n",
+    "- https://pypi.org/project/pytorch-pretrained-bert/#examples (written for the older `pytorch-pretrained-bert` package; `pytorch-transformers` models return tuples, so the examples need small adjustments)\n",
+    "- https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial\n",
+    "- BERT can also be loaded through the PyTorch Hub entry for the pytorch-transformers library: https://pytorch.org/hub/huggingface_pytorch-transformers/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "f40665f4-db92-4567-b0f9-001bd35e9284",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 877100.29B/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load pre-trained model tokenizer (vocabulary)\n",
+    "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
+    "\n",
+    "# TODO(botelho3): `bert-base-uncased` is big. Load `bert-tiny` instead from the filesystem?\n",
+    "# Model available at https://huggingface.co/prajjwal1/bert-tiny.\n",
+    "# Sketch: model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None)\n",
+    "\n",
+    "# Tokenized input\n",
+    "text = \"[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]\"\n",
+    "tokenized_text = tokenizer.tokenize(text)\n",
+    "\n",
+    "# Mask a token that we will try to predict back with `BertForMaskedLM`\n",
+    "masked_index = 8\n",
+    "tokenized_text[masked_index] = '[MASK]'\n",
+    "assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']\n",
+    "\n",
+    "# Convert token to vocabulary indices\n",
+    "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
+    "\n",
+    "# Define sentence A and B indices associated to 1st and 2nd sentences (see paper).\n",
+    "# Everything up to and including the first [SEP] is sentence A (0); the rest is\n",
+    "# sentence B (1). Deriving this from the tokens keeps it in sync if `text` changes.\n",
+    "first_sep_index = tokenized_text.index('[SEP]')\n",
+    "segments_ids = [0] * (first_sep_index + 1) + [1] * (len(tokenized_text) - first_sep_index - 1)\n",
+    "assert len(segments_ids) == len(indexed_tokens)\n",
+    "\n",
+    "# Convert inputs to PyTorch tensors\n",
+    "tokens_tensor = torch.tensor([indexed_tokens])\n",
+    "segments_tensors = torch.tensor([segments_ids])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7539b72-a871-4d52-947b-075b5c4be2ae",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de132943-2c3e-405b-b2f8-5d348472672c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load pre-trained model (weights) and switch to evaluation mode.\n",
+    "model = BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
+    "model.eval()\n",
+    "\n",
+    "# If you have a GPU, put everything on cuda\n",
+    "if USE_GPU_:\n",
+    "    tokens_tensor = tokens_tensor.to('cuda')\n",
+    "    segments_tensors = segments_tensors.to('cuda')\n",
+    "    model.to('cuda')\n",
+    "\n",
+    "# Predict all tokens.\n",
+    "# pytorch_transformers models return a tuple whose first element holds the\n",
+    "# prediction scores, so unpack it before indexing with [batch, position].\n",
+    "# The segment ids are passed by keyword because the position of\n",
+    "# `token_type_ids` in `forward` is not stable across library releases.\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(tokens_tensor, token_type_ids=segments_tensors)\n",
+    "    predictions = outputs[0]\n",
+    "\n",
+    "# confirm we were able to predict 'henson'\n",
+    "predicted_index = torch.argmax(predictions[0, masked_index]).item()\n",
+    "predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n",
+    "assert predicted_token == 'henson'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02efb18a-626d-42d4-b19d-ba4b8f752a00",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e2d58de-bf9a-4b57-977f-6aa3ac75e84f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial\n",
+    "class BERTClassification(nn.Module):\n",
+    "    \"\"\"Pre-trained BERT encoder followed by dropout and a linear output layer.\n",
+    "\n",
+    "    NOTE: the default checkpoint is cased, while the tokenizer loaded earlier in\n",
+    "    this notebook is 'bert-base-uncased'. Token ids must come from the vocabulary\n",
+    "    that matches `pretrained_name`.\n",
+    "\n",
+    "    Args:\n",
+    "        pretrained_name: checkpoint name or path given to `BertModel.from_pretrained`.\n",
+    "        dropout: dropout probability applied to BERT's pooled output.\n",
+    "        num_outputs: number of output features of the final linear layer.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pretrained_name='bert-base-cased', dropout=0.4, num_outputs=1):\n",
+    "        super().__init__()\n",
+    "        self.bert = BertModel.from_pretrained(pretrained_name)\n",
+    "        self.bert_drop = nn.Dropout(dropout)\n",
+    "        # Take the width from the loaded config rather than hard-coding 768, so\n",
+    "        # checkpoints with a different hidden size work too.\n",
+    "        self.out = nn.Linear(self.bert.config.hidden_size, num_outputs)\n",
+    "\n",
+    "    def forward(self, ids, mask, token_type_ids):\n",
+    "        \"\"\"Return the linear layer's output for BERT's pooled representation.\n",
+    "\n",
+    "        Args:\n",
+    "            ids: input token ids passed to BERT.\n",
+    "            mask: passed to BERT as `attention_mask`.\n",
+    "            token_type_ids: passed to BERT as `token_type_ids`.\n",
+    "        \"\"\"\n",
+    "        bert_outputs = self.bert(ids, attention_mask=mask,\n",
+    "                                 token_type_ids=token_type_ids)\n",
+    "        # Index instead of unpacking two values: the output tuple grows when the\n",
+    "        # config enables hidden-state or attention outputs.\n",
+    "        pooled_out = bert_outputs[1]\n",
+    "        return self.out(self.bert_drop(pooled_out))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f0d5fa9-226e-4612-b28f-ed24de16399a",
+   "metadata": {},
+   "source": [
+    "### Load "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "427a42a6-441e-438c-96f4-b38ba82fd192",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1056b60a-5861-4e47-b694-891053cc8470",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:code id:3225717c-458b-45cb-9e8b-89ecd03b9d07 tags:
+
+``` python
+```
+
+%% Cell type:markdown id:7ccba6af-1867-491d-b9d0-d99965738c64 tags:
+
+# Imports
+
+%% Cell type:code id:5da368cd-3045-4ec0-86fb-2ce15b5d1b92 tags:
+
+``` python
+# General includes.
+import os
+
+# Numerical includes.
+import numpy as np
+import pandas as pd
+import torch
+from torch import nn
+
+# Pyhealth includes.
+from pyhealth.datasets import MIMIC3Dataset
+```
+
+%% Cell type:code id:95e714b1-ca5d-4d7b-9ace-6b4372adfd9f tags:
+
+``` python
+# Model imports
+from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
+```
+
+%% Cell type:markdown id:cf83fa50-9dd0-467b-bd36-634795e4a09c tags:
+
+# Globals
+
+%% Cell type:code id:ebef3528-0396-459a-8112-b272089f5d67 tags:
+
+``` python
+USE_GPU_ = False  # Set to True to move the model and input tensors to CUDA before prediction.
+```
+
+%% Cell type:markdown id:df0f8271-44b2-44a2-b7cc-cd7339a70e87 tags:
+
+## Preprocessing
+
+%% Cell type:code id:02624f53-6cee-4db0-8b15-e08b755a04f2 tags:
+
+``` python
+```
+
+%% Cell type:markdown id:ebe2c2d9-c161-4b9d-8182-86384faea929 tags:
+
+### Load BERT Model
+
+%% Cell type:markdown id:39b0edec-1570-4640-9926-d908b456f957 tags:
+
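+See instructions here:
+- https://pypi.org/project/pytorch-pretrained-bert/#examples (written for the older `pytorch-pretrained-bert` package; `pytorch-transformers` models return tuples, so the examples need small adjustments)
+- https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
+- BERT can also be loaded through the PyTorch Hub entry for the pytorch-transformers library: https://pytorch.org/hub/huggingface_pytorch-transformers/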
+
+%% Cell type:code id:f40665f4-db92-4567-b0f9-001bd35e9284 tags:
+
+``` python
+
+# Load pre-trained model tokenizer (vocabulary)
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+
+# TODO(botelho3)`bert-base-uncased` is big. Load `bert-tiny` instead from the filesystem?
+# Model available at https://huggingface.co/prajjwal1/bert-tiny.
+# model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None)
+
+# Tokenized input
+text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+tokenized_text = tokenizer.tokenize(text)
+
+# Mask a token that we will try to predict back with `BertForMaskedLM`
+masked_index = 8
+tokenized_text[masked_index] = '[MASK]'
+assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
+
+# Convert token to vocabulary indices
+indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
+segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+# Convert inputs to PyTorch tensors
+tokens_tensor = torch.tensor([indexed_tokens])
+segments_tensors = torch.tensor([segments_ids])
+```
+
+%% Output
+
+    100%|█████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 877100.29B/s]
+
+%% Cell type:code id:a7539b72-a871-4d52-947b-075b5c4be2ae tags:
+
+``` python
+```
+
+%% Cell type:code id:de132943-2c3e-405b-b2f8-5d348472672c tags:
+
+``` python
+# Load pre-trained model (weights)
+model = BertForMaskedLM.from_pretrained('bert-base-uncased')
+model.eval()
+
+# If you have a GPU, put everything on cuda
+if USE_GPU_:
+    tokens_tensor = tokens_tensor.to('cuda')
+    segments_tensors = segments_tensors.to('cuda')
+    model.to('cuda')
+
+# Predict all tokens
+with torch.no_grad():
+    predictions = model(tokens_tensor, segments_tensors)
+
+# confirm we were able to predict 'henson'
+predicted_index = torch.argmax(predictions[0, masked_index]).item()
+predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+assert predicted_token == 'henson'
+```
+
+%% Output
+
+    ---------------------------------------------------------------------------
+    TypeError                                 Traceback (most recent call last)
+    /var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_23031/2203343201.py in <module>
+         14
+         15 # confirm we were able to predict 'henson'
+    ---> 16 predicted_index = torch.argmax(predictions[0, masked_index]).item()
+         17 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+         18 assert predicted_token == 'henson'
+    TypeError: tuple indices must be integers or slices, not tuple
+
+%% Cell type:code id:02efb18a-626d-42d4-b19d-ba4b8f752a00 tags:
+
+``` python
+```
+
+%% Cell type:code id:0e2d58de-bf9a-4b57-977f-6aa3ac75e84f tags:
+
+``` python
+# https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
+class BERTClassification(nn.Module):
+    def __init__ (self):
+        super(BERTClassification, self).__init__()
+        self.bert = BertModel.from_pretrained('bert-base-cased')
+        self.bert_drop = nn.Dropout(0.4)
+        self.out = nn.Linear(768, 1)
+
+    def forward(self, ids, mask, token_type_ids):
+        _, pooledOut = self.bert(ids, attention_mask = mask,
+                                token_type_ids=token_type_ids)
+        bertOut = self.bert_drop(pooledOut)
+        output = self.out(bertOut)
+
+        return output
+```
+
+%% Cell type:markdown id:8f0d5fa9-226e-4612-b28f-ed24de16399a tags:
+
+### Load
+
+%% Cell type:code id:427a42a6-441e-438c-96f4-b38ba82fd192 tags:
+
+``` python
+```
+
+%% Cell type:code id:1056b60a-5861-4e47-b694-891053cc8470 tags:
+
+``` python
+```