AlchemistZoro
diff --git a/‎.gitignore‎
Lines changed: 146 additions & 0 deletions b/‎.gitignore‎
Lines changed: 146 additions & 0 deletions
diff --git a/‎LICENSE‎
Lines changed: 21 additions & 0 deletions b/‎LICENSE‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 143 additions & 0 deletions b/‎README.md‎
Lines changed: 143 additions & 0 deletions
diff --git a/‎create_enviroment.sh‎
Lines changed: 17 additions & 0 deletions b/‎create_enviroment.sh‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎create_enviroment_3090.sh‎
Lines changed: 17 additions & 0 deletions b/‎create_enviroment_3090.sh‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎download_raw_data.sh‎
Lines changed: 35 additions & 0 deletions b/‎download_raw_data.sh‎
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,146 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Vscode file
+.vscode/
+
+# GreaseLM project specific
+data/
+data_download/
+logs/
+runs/
+*.zip
+wandb/
+checkpoint/
+log_useful/
+
+# Files generated while running GreaseLM
+filtered_concept.txt
+matcher_res.json
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Xikun Zhang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,143 @@
+# GreaseLM: Graph REASoning Enhanced Language Models for Question Answering
+
+This repo provides the source code & data of our paper [GreaseLM: Graph REASoning Enhanced Language Models for Question Answering](https://arxiv.org/abs/2201.08860) (ICLR 2022 spotlight). If you use any of our code, processed data or pretrained models, please cite:
+```bib
+@inproceedings{zhang2021greaselm,
+  title={GreaseLM: Graph REASoning Enhanced Language Models},
+  author={Zhang, Xikun and Bosselut, Antoine and Yasunaga, Michihiro and Ren, Hongyu and Liang, Percy and Manning, Christopher D and Leskovec, Jure},
+  booktitle={International Conference on Learning Representations},
+  year={2021}
+}
+```
+
+<p align="center">
+  <img src="./figs/greaselm.png" width="600" title="GreaseLM model architecture" alt="">
+</p>
+
+## 1. Dependencies
+
+- [Python](<https://www.python.org/>) == 3.8
+- [PyTorch](<https://pytorch.org/get-started/locally/>) == 1.8.0
+- [transformers](<https://github.com/huggingface/transformers/tree/v3.4.0>) == 3.4.0
+- [torch-geometric](https://pytorch-geometric.readthedocs.io/) == 1.7.0
+
+Run the following commands to create a conda environment (assuming CUDA 10.1):
+```bash
+conda create -y -n greaselm python=3.8
+conda activate greaselm
+pip install numpy==1.18.3 tqdm
+pip install torch==1.8.0+cu101 torchvision -f https://download.pytorch.org/whl/torch_stable.html
+pip install transformers==3.4.0 nltk spacy
+pip install wandb
+conda install -y -c conda-forge tensorboardx
+conda install -y -c conda-forge tensorboard
+
+# for torch-geometric
+pip install torch-scatter==2.0.7 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+pip install torch-cluster==1.5.9 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+pip install torch-sparse==0.6.9 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+pip install torch-spline-conv==1.2.1 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+pip install torch-geometric==1.7.0 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+```
+
+
+## 2. Download data
+
+### Download and preprocess data yourself
+**Preprocessing the data yourself may take a long time, so if you want to directly download preprocessed data, please jump to the next subsection.**
+
+Download the raw ConceptNet, CommonsenseQA, OpenBookQA data by using
+```
+./download_raw_data.sh
+```
+
+You can preprocess these raw data by running
+```
+CUDA_VISIBLE_DEVICES=0 python preprocess.py -p <num_processes>
+```
+You can specify the GPU you want to use at the beginning of the command `CUDA_VISIBLE_DEVICES=...`. The script will:
+* Setup ConceptNet (e.g., extract English relations from ConceptNet, merge the original 42 relation types into 17 types)
+* Convert the QA datasets into .jsonl files (e.g., stored in `data/csqa/statement/`)
+* Identify all mentioned concepts in the questions and answers
+* Extract subgraphs for each q-a pair
+
+The script to download and preprocess the [MedQA-USMLE](https://github.com/jind11/MedQA) data and the biomedical knowledge graph based on Disease Database and DrugBank is provided in `utils_biomed/`.
+
+### Directly download preprocessed data
+For your convenience, if you don't want to preprocess the data yourself, you can download all the preprocessed data [here](https://drive.google.com/drive/folders/1T6B4nou5P3u-6jr0z6e3IkitO8fNVM6f?usp=sharing). Download them into the top-level directory of this repo and unzip them. Move the `medqa_usmle` and `ddb` folders into the `data/` directory.
+
+### Resulting file structure
+
+The resulting file structure should look like this:
+
+```plain
+.
+├── README.md
+├── data/
+    ├── cpnet/                 (preprocessed ConceptNet)
+    ├── csqa/
+        ├── train_rand_split.jsonl
+        ├── dev_rand_split.jsonl
+        ├── test_rand_split_no_answers.jsonl
+        ├── statement/             (converted statements)
+        ├── grounded/              (grounded entities)
+        ├── graphs/                (extracted subgraphs)
+        ├── ...
+    ├── obqa/
+    ├── medqa_usmle/
+    └── ddb/
+```
+
+## 3. Training GreaseLM
+To train GreaseLM on CommonsenseQA, run
+```
+CUDA_VISIBLE_DEVICES=0 ./run_greaselm.sh csqa --data_dir data/
+```
+You can specify up to 2 GPUs you want to use at the beginning of the command `CUDA_VISIBLE_DEVICES=...`.
+
+Similarly, to train GreaseLM on OpenbookQA, run
+```
+CUDA_VISIBLE_DEVICES=0 ./run_greaselm.sh obqa --data_dir data/
+```
+
+To train GreaseLM on MedQA-USMLE, run
+```
+CUDA_VISIBLE_DEVICES=0 ./run_greaselm__medqa_usmle.sh
+```
+
+## 4. Pretrained model checkpoints
+You can download a pretrained GreaseLM model on CommonsenseQA [here](https://drive.google.com/file/d/1QPwLZFA6AQ-pFfDR6TWLdBAvm3c_HOUr/view?usp=sharing), which achieves an IH-dev acc. of `79.0` and an IH-test acc. of `74.0`.
+
+You can also download a pretrained GreaseLM model on OpenbookQA [here](https://drive.google.com/file/d/1-QqyiQuU9xlN20vwfIaqYQ_uJMP8d7Pv/view?usp=sharing), which achieves a test acc. of `84.8`.
+
+You can also download a pretrained GreaseLM model on MedQA-USMLE [here](https://drive.google.com/file/d/1j0QxiBiGbv0s9PhseSly6V6uiHWU5IEt/view?usp=sharing), which achieves a test acc. of `38.5`.
+
+## 5. Evaluating a pretrained model checkpoint
+To evaluate a pretrained GreaseLM model checkpoint on CommonsenseQA, run
+```
+CUDA_VISIBLE_DEVICES=0 ./eval_greaselm.sh csqa --data_dir data/ --load_model_path /path/to/checkpoint
+```
+Again, you can specify up to 2 GPUs you want to use at the beginning of the command `CUDA_VISIBLE_DEVICES=...`.
+
+Similarly, to evaluate a pretrained GreaseLM model checkpoint on OpenbookQA, run
+```
+CUDA_VISIBLE_DEVICES=0 ./eval_greaselm.sh obqa --data_dir data/ --load_model_path /path/to/checkpoint
+```
+To evaluate a pretrained GreaseLM model checkpoint on MedQA-USMLE, run
+```
+INHERIT_BERT=1 CUDA_VISIBLE_DEVICES=0 ./eval_greaselm.sh medqa_usmle --data_dir data/ --load_model_path /path/to/checkpoint
+```
+
+## 6. Use your own dataset
+- Convert your dataset to  `{train,dev,test}.statement.jsonl`  in .jsonl format (see `data/csqa/statement/train.statement.jsonl`)
+- Create a directory in `data/{yourdataset}/` to store the .jsonl files
+- Modify `preprocess.py` and perform subgraph extraction for your data
+- Modify `utils/parser_utils.py` to support your own dataset
+
+## 7. Acknowledgment
+This repo is built upon the following work:
+```
+QA-GNN: Question Answering using Language Models and Knowledge Graphs
+https://github.com/michiyasunaga/qagnn
+```
+Many thanks to the authors and developers!
@@ -0,0 +1,17 @@
+# Create the greaselm conda environment with the CUDA 10.1 build of torch 1.8.0.
+# Load the conda shell functions first so that conda activate works in a script.
+eval "$(conda shell.bash hook)"
+conda create -y -n greaselm python=3.8
+conda activate greaselm
+pip install numpy==1.18.3 tqdm
+pip install torch==1.8.0+cu101 torchvision -f https://download.pytorch.org/whl/torch_stable.html
+pip install transformers==3.4.0 nltk spacy
+pip install wandb
+conda install -y -c conda-forge tensorboardx
+conda install -y -c conda-forge tensorboard
+# for torch-geometric
+pip install torch-scatter==2.0.7 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+pip install torch-cluster==1.5.9 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+pip install torch-sparse==0.6.9 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+pip install torch-spline-conv==1.2.1 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+pip install torch-geometric==1.7.0 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
@@ -0,0 +1,17 @@
+# Create the glm conda environment for an RTX 3090.
+# Needs CUDA 11.1 or newer and torch 1.7.0 or newer.
+# Load the conda shell functions first so that conda activate works in a script.
+eval "$(conda shell.bash hook)"
+conda create -y -n glm python=3.8
+conda activate glm
+
+# Install PyTorch with conda. Per the original note, a pip install ends in an OSError.
+conda install -y pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch
+
+# Install torch-geometric as described in the official docs:
+# https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html
+pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
+
+pip install scipy transformers==3.4.0 tensorboardx nltk spacy networkx wandb
+
+
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Fetch the raw ConceptNet, CommonsenseQA and OpenBookQA files into data/
+# and create the output folders. All paths are relative to the current directory.
+
+# download ConceptNet
+mkdir -p data/cpnet/
+wget -nc -P data/cpnet/ https://s3.amazonaws.com/conceptnet/downloads/2018/edges/conceptnet-assertions-5.6.0.csv.gz
+# Decompress in place instead of cd-ing into data/cpnet/, so a failed cd can
+# no longer shift every later relative path. An existing CSV is not overwritten.
+yes n | gzip -d data/cpnet/conceptnet-assertions-5.6.0.csv.gz
+# download ConceptNet entity embedding
+# -nc skips the file when it is already present instead of saving tzw.ent.npy.1
+wget -nc -P data/cpnet/ https://csr.s3-us-west-1.amazonaws.com/tzw.ent.npy
+
+# download CommonsenseQA dataset
+mkdir -p data/csqa/
+wget -nc -P data/csqa/ https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl
+wget -nc -P data/csqa/ https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl
+wget -nc -P data/csqa/ https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl
+
+# create output folders
+mkdir -p data/csqa/grounded/
+mkdir -p data/csqa/graph/
+mkdir -p data/csqa/statement/
+
+# download OpenBookQA dataset
+mkdir -p data/obqa/
+wget -nc -P data/obqa/ https://s3-us-west-2.amazonaws.com/ai2-website/data/OpenBookQA-V1-Sep2018.zip
+yes n | unzip data/obqa/OpenBookQA-V1-Sep2018.zip -d data/obqa/
+
+# create output folders
+mkdir -p data/obqa/fairseq/official/
+mkdir -p data/obqa/grounded/
+mkdir -p data/obqa/graph/
+mkdir -p data/obqa/statement/