From fe1bb1d7b1966f199d23c03541d6ad9436966a7f Mon Sep 17 00:00:00 2001
From: aBotel <admbotel@uwaterloo.ca>
Date: Tue, 11 Apr 2023 02:46:56 -0700
Subject: [PATCH] Add install script for Py deps. Add skeleton .ipynb that loads BERT.

---
 README.md                         |   2 +
 install.sh                        |  11 ++
 src/cs598_dlh_final_project.ipynb | 271 ++++++++++++++++++++++++++++++
 3 files changed, 284 insertions(+)
 create mode 100755 install.sh
 create mode 100644 src/cs598_dlh_final_project.ipynb

diff --git a/README.md b/README.md
index 07d6ef0..f8927c9 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ To make it easy for you to get started with GitLab, here's a list of recommended
 
 Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)!
 
+Colab info: https://answers.illinois.edu/illinois/122558
+
 ## Add your files
 
 - [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..e154b15
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Uncomment to activate your conda env before installing.
+# conda activate <your_conda_env>
+
+# Optional: make the BERT model available in PyTorch via the legacy package.
+# conda config --add channels conda-forge
+# conda install --yes -c conda-forge pytorch-pretrained-bert
+
+pip install pytorch-transformers
+pip install pyhealth
diff --git a/src/cs598_dlh_final_project.ipynb b/src/cs598_dlh_final_project.ipynb
new file mode 100644
index 0000000..c4d5a68
--- /dev/null
+++ b/src/cs598_dlh_final_project.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3225717c-458b-45cb-9e8b-89ecd03b9d07",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ccba6af-1867-491d-b9d0-d99965738c64",
+   "metadata": {},
+   "source": [
+    "# Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "5da368cd-3045-4ec0-86fb-2ce15b5d1b92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# General includes.\n",
+    "import os\n",
+    "\n",
+    "# Numerical includes.\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "\n",
+    "# PyHealth includes.\n",
+    "from pyhealth.datasets import MIMIC3Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "95e714b1-ca5d-4d7b-9ace-6b4372adfd9f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model imports.\n",
+    "from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cf83fa50-9dd0-467b-bd36-634795e4a09c",
+   "metadata": {},
+   "source": [
+    "# Globals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ebef3528-0396-459a-8112-b272089f5d67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set to True to move the model and tensors to CUDA.\n",
+    "USE_GPU_ = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df0f8271-44b2-44a2-b7cc-cd7339a70e87",
+   "metadata": {},
+   "source": [
+    "## Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02624f53-6cee-4db0-8b15-e08b755a04f2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebe2c2d9-c161-4b9d-8182-86384faea929",
+   "metadata": {},
+   "source": [
+    "### Load BERT Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39b0edec-1570-4640-9926-d908b456f957",
+   "metadata": {},
+ "source": [ + "See instructions here: \n", + "- https://pypi.org/project/pytorch-pretrained-bert/#examples\n", + "- https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial\n", + "- Could also get it from pytorch transformers library: https://pytorch.org/hub/huggingface_pytorch-transformers/" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f40665f4-db92-4567-b0f9-001bd35e9284", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 877100.29B/s]\n" + ] + } + ], + "source": [ + "\n", + "# Load pre-trained model tokenizer (vocabulary)\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "\n", + "\n", + "# TODO(botelho3)`bert-base-uncased` is big. Load `bert-tiny` instead from the filesystem?\n", + "# Model available at https://huggingface.co/prajjwal1/bert-tiny.\n", + "# model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None)\n", + "\n", + "# Tokenized input\n", + "text = \"[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]\"\n", + "tokenized_text = tokenizer.tokenize(text)\n", + "\n", + "# Mask a token that we will try to predict back with `BertForMaskedLM`\n", + "masked_index = 8\n", + "tokenized_text[masked_index] = '[MASK]'\n", + "assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']\n", + "\n", + "# Convert token to vocabulary indices\n", + "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n", + "# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)\n", + "segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]\n", + "\n", + "# Convert inputs to PyTorch tensors\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "segments_tensors = torch.tensor([segments_ids])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7539b72-a871-4d52-947b-075b5c4be2ae", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "de132943-2c3e-405b-b2f8-5d348472672c", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "tuple indices must be integers or slices, not tuple", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/bw/pyw_1xcj0f302h0krt1_f5lm0000gn/T/ipykernel_23031/2203343201.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;31m# confirm we were able to predict 'henson'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mpredicted_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpredictions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmasked_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0mpredicted_token\u001b[0m \u001b[0;34m=\u001b[0m 
+   "source": [
+    "# Load the pre-trained model (weights).\n",
+    "model = BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
+    "model.eval()\n",
+    "\n",
+    "# If you have a GPU, put everything on CUDA.\n",
+    "if USE_GPU_:\n",
+    "    tokens_tensor = tokens_tensor.to('cuda')\n",
+    "    segments_tensors = segments_tensors.to('cuda')\n",
+    "    model.to('cuda')\n",
+    "\n",
+    "# Predict all tokens. Unlike pytorch-pretrained-bert, pytorch-transformers\n",
+    "# models return a tuple; the first element holds the masked-LM scores.\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(tokens_tensor, token_type_ids=segments_tensors)\n",
+    "predictions = outputs[0]\n",
+    "\n",
+    "# Confirm we are able to predict 'henson'.\n",
+    "predicted_index = torch.argmax(predictions[0, masked_index]).item()\n",
+    "predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n",
+    "assert predicted_token == 'henson'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02efb18a-626d-42d4-b19d-ba4b8f752a00",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e2d58de-bf9a-4b57-977f-6aa3ac75e84f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Binary classification head on top of BERT, adapted from\n",
+    "# https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial\n",
+    "class BERTClassification(nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(BERTClassification, self).__init__()\n",
+    "        self.bert = BertModel.from_pretrained('bert-base-cased')\n",
+    "        self.bert_drop = nn.Dropout(0.4)\n",
+    "        self.out = nn.Linear(768, 1)\n",
+    "\n",
+    "    def forward(self, ids, mask, token_type_ids):\n",
+    "        # BertModel returns (sequence_output, pooled_output); keep the\n",
+    "        # pooled [CLS] representation for classification.\n",
+    "        _, pooled_out = self.bert(ids, attention_mask=mask,\n",
+    "                                  token_type_ids=token_type_ids)\n",
+    "        bert_out = self.bert_drop(pooled_out)\n",
+    "        output = self.out(bert_out)\n",
+    "\n",
+    "        return output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f0d5fa9-226e-4612-b28f-ed24de16399a",
+   "metadata": {},
+   "source": [
+    "### Load Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "427a42a6-441e-438c-96f4-b38ba82fd192",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1056b60a-5861-4e47-b694-891053cc8470",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab
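
Usage note: once the notebook's `BERTClassification` class is defined, its forward pass can be smoke-tested end to end before wiring it to MIMIC-III data. Below is a minimal sketch, assuming the class from the notebook is in scope and the `bert-base-cased` weights download successfully; the sentence, `clf`, and the final sigmoid are illustrative, not part of this patch.

```python
import torch
from pytorch_transformers import BertTokenizer

# Tokenize one sentence; 'bert-base-cased' matches the encoder used
# inside BERTClassification.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokens = tokenizer.tokenize("[CLS] Who was Jim Henson ? [SEP]")
ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
mask = torch.ones_like(ids)             # no padding, so attend to every token
token_type_ids = torch.zeros_like(ids)  # single sentence -> all segment 0

clf = BERTClassification()  # the class defined in the notebook
clf.eval()
with torch.no_grad():
    logits = clf(ids, mask=mask, token_type_ids=token_type_ids)
prob = torch.sigmoid(logits)  # nn.Linear(768, 1) head -> one binary probability
print(prob.shape)             # torch.Size([1, 1])
```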