From fa07edcc150eefcf3e507b2db9caee05dc5980ce Mon Sep 17 00:00:00 2001
From: Colin Wang <zw1300@princeton.edu>
Date: Wed, 12 Jun 2024 22:28:57 -0400
Subject: [PATCH] restructure codebase

---
 README.md                                     | 66 +++++++++----------
 run.sh                                        |  8 +--
 constants.py => src/constants.py              |  0
 .../descriptive_utils.py                      |  0
 evaluate.py => src/evaluate.py                |  0
 generate.py => src/generate.py                |  4 +-
 get_score.py => src/get_score.py              |  0
 reasoning_utils.py => src/reasoning_utils.py  |  0
 8 files changed, 38 insertions(+), 40 deletions(-)
 rename constants.py => src/constants.py (100%)
 rename descriptive_utils.py => src/descriptive_utils.py (100%)
 rename evaluate.py => src/evaluate.py (100%)
 rename generate.py => src/generate.py (96%)
 rename get_score.py => src/get_score.py (100%)
 rename reasoning_utils.py => src/reasoning_utils.py (100%)

diff --git a/README.md b/README.md
index b70717d..d29e2d3 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,8 @@ unzip images.zip && rm images.zip
 │   ├── image_metadata_val.json
 │   ├── reasoning_test.json
 │   ├── reasoning_val.json
-│   └── README.md
+│   ├── README.md
+│   └── LICENSE
 ├── images/
 │   ├── 0.jpg
 │   ├── ...
@@ -44,38 +45,36 @@ unzip images.zip && rm images.zip
 │   └── README.md
 ├── results/
 │   └── README.md
-├── constants.py
-├── descriptive_utils.py
-├── reasoning_utils.py
-├── evaluate.py
-├── generate.py
-├── get_score.py
+├── src/
+│   ├── constants.py
+│   ├── descriptive_utils.py
+│   ├── reasoning_utils.py
+│   ├── evaluate.py
+│   ├── generate.py
+│   └── get_score.py
 ├── run.sh
-└── README.md
+├── README.md
+├── LICENSE
+└── .gitignore
 ```
-`data` folder contains all QAs and metadata for images, descriptive questions, and reasoning questions. Answers for the test split are intentionally made to `null` to prevent testing data from leaking into the public.
-
-`images` folder contains all images where their identifiers range from 0 to 2399. Note that there are only 2333 images in total and the numberings are **not** consecutive.
-
-`results` folder contains all response generation and scoring results.
-
-`constants.py` stores all the prompts and mappings from question ids to actual questions.
-
-`descriptive_utils.py` contains all code to build queries for response generation and grading, as well as saving all artifacts for descriptive questions.
+* `data` folder contains all QAs and metadata for images, descriptive questions, and reasoning questions (a quick loading sketch follows this list). Answers for the test split are intentionally set to `null` to prevent test data from leaking to the public.
+* `images` folder contains all images, whose identifiers range from 0 to 2399. Note that there are only 2333 images in total and the numbering is **not** consecutive.
+* `results` folder contains all response generation and scoring results.
+* `src` folder contains all Python code for CharXiv:
+  * `constants.py` stores all the prompts and mappings from question IDs to actual questions.
+  * `descriptive_utils.py` contains all code to build queries for response generation and grading, as well as to save all artifacts, for descriptive questions.
+  * `reasoning_utils.py` contains all code to build queries for response generation and grading, as well as to save all artifacts, for reasoning questions.
+  * `evaluate.py` is the main script to evaluate model responses against the answers via GPT API calls.
+  * `generate.py` is the main script that loops over all QAs for your model to generate responses.
+  * `get_score.py` is the main script that prints the reasoning and descriptive question scores.
+* `run.sh` is the script that runs the full evaluation pipeline (generation, grading, and scoring).
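+
+As a quick sanity check of the data files, a loading sketch like the following should work (a minimal sketch; the exact schema inside each JSON file is not spelled out here, so treat what it prints as the source of truth):
+```py
+import json
+
+# load the validation reasoning QAs and peek at one entry
+with open("data/reasoning_val.json") as f:
+    reasoning_val = json.load(f)
+
+print(type(reasoning_val), len(reasoning_val))
+# works whether the file is list- or dict-shaped
+sample = reasoning_val[0] if isinstance(reasoning_val, list) else next(iter(reasoning_val.values()))
+print(sample)
+```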
 
-`reasoning_utils.py` contains all code to build queries for response generation and grading, as well as saving all artifacts for reasoning questions.
-
-`evaluate.py` is the main function to evaluate model responses against the answer with gpt API calls.
-
-`generate.py` is the main function to loop QAs for model to generate responses.
-
-`get_score.py` is the main function to print the reasoning and descriptive question scores.
 
 </details>
 
 ### Response generation
 CharXiv doesn't require any third-party Python library when prompting your models to generate responses to the chart-question pairs. Therefore, to set up your model, you should implement the `evaluate` function in `src/generate.py`. Specifically, this function takes `queries` as its input, which contains all the charts and questions CharXiv uses to evaluate models. It has the following structure:
-```
+```js
 {
     figure_id:{
         'question': ...<str>
@@ -88,17 +87,14 @@ CharXiv doesn't require any third-party python library when prompting your model
     },
 }
 ```
-Once you load your models and all preprocessing functions, simply to the following:
-```
+Once you have loaded your model and all preprocessing functions, simply fill in the loop inside the `evaluate` function in `src/generate.py`:
+```py
 for k in tqdm(queries):
-        query = queries[k]['question']
-        image = queries[k]["figure_path"]
-        ########## Your own code ##########
-        query, image = preprocess(query, image) # your own model's preprocessing functions such as adding additional information or processing images.
-        response = model.chat(query, image)
-        ###################################
-        # once your model generates the response, simply do this and you are all set!
-        queries[k]['response'] = response
+    query = queries[k]['question']
+    image = queries[k]["figure_path"]
+    query, image = preprocess(query, image) # TODO: your model's own preprocessing (e.g., prompt formatting, image loading)
+    response = model.chat(query, image) # TODO: your model's generation call
+    queries[k]['response'] = response
 ```
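+
+For a concrete starting point, the loop could look like the sketch below, shown with an off-the-shelf Hugging Face vision-language model (`llava-hf/llava-1.5-7b-hf` is used purely as an illustration; the model id, prompt template, and generation settings are assumptions to adapt to your own model):
+```py
+import torch
+from PIL import Image
+from tqdm import tqdm
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+model_id = "llava-hf/llava-1.5-7b-hf"  # illustrative choice, not a requirement
+processor = AutoProcessor.from_pretrained(model_id)
+model = LlavaForConditionalGeneration.from_pretrained(
+    model_id, torch_dtype=torch.float16, device_map="auto")
+
+for k in tqdm(queries):
+    question = queries[k]['question']
+    image = Image.open(queries[k]["figure_path"]).convert("RGB")
+    # LLaVA-1.5 prompt template; other models use different templates
+    prompt = f"USER: <image>\n{question} ASSISTANT:"
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
+    output = model.generate(**inputs, max_new_tokens=512, do_sample=False)
+    decoded = processor.decode(output[0], skip_special_tokens=True)
+    # the decoded string includes the prompt, so keep only the generated answer
+    queries[k]['response'] = decoded.split("ASSISTANT:")[-1].strip()
+```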
 
 To generate model responses:
diff --git a/run.sh b/run.sh
index c848902..0cd5bdd 100644
--- a/run.sh
+++ b/run.sh
@@ -2,21 +2,21 @@ model_name=my_model # custom name for the model
 openai_key=my_key # OpenAI API key
 split=val # choose from val, test
 mode=reasoning # choose from reasoning, descriptive
-model_path=my_model_path # path to the model, customizable argument
+model_path="your_path" # path to the model, customizable argument
 
-python generate.py \
+python src/generate.py \
     --model_name $model_name \
     --split $split \
     --mode $mode \
     --model_path $model_path
 
-python evaluate.py \
+python src/evaluate.py \
     --model_name $model_name \
     --split $split \
     --mode $mode \
     --api_key $openai_key
 
-python get_score.py \
+python src/get_score.py \
     --model_name $model_name \
     --split $split \
     --mode $mode
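+
+# Each stage above can also be re-run on its own, e.g. to re-print scores
+# without regenerating responses:
+# python src/get_score.py --model_name $model_name --split $split --mode $mode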
diff --git a/constants.py b/src/constants.py
similarity index 100%
rename from constants.py
rename to src/constants.py
diff --git a/descriptive_utils.py b/src/descriptive_utils.py
similarity index 100%
rename from descriptive_utils.py
rename to src/descriptive_utils.py
diff --git a/evaluate.py b/src/evaluate.py
similarity index 100%
rename from evaluate.py
rename to src/evaluate.py
diff --git a/generate.py b/src/generate.py
similarity index 96%
rename from generate.py
rename to src/generate.py
index f5f22e3..8f1f5d7 100644
--- a/generate.py
+++ b/src/generate.py
@@ -75,7 +75,9 @@ if __name__ == '__main__':
     print("Evaluation mode:", args.mode)
     print("Output file:", output_file)
 
-    evaluate(queries) # switch to demo(queries, model_path) for evaluating the IXC2 4khd model
+    # demo() runs the IXC2 4khd model; switch back to evaluate(queries) for your own model
+    demo(queries, model_path=args.model_path)
+    # evaluate(queries)
 
     for k in queries:
         queries[k].pop("figure_path", None)
diff --git a/get_score.py b/src/get_score.py
similarity index 100%
rename from get_score.py
rename to src/get_score.py
diff --git a/reasoning_utils.py b/src/reasoning_utils.py
similarity index 100%
rename from reasoning_utils.py
rename to src/reasoning_utils.py
-- 
GitLab