Unverified commit 44e47cbb authored by Yushi Bai, committed by GitHub

Delete retrieval directory

parent 8bf3b3e6
Showing with 0 additions and 1295 deletions
#!/bin/bash
chunk_size=500
work_dir="../../LongBench" # dir for storing data
source_dir="${work_dir}/data" # source LongBench dir
dest_dir=""${work_dir}/B${chunk_size}/data""
file_names=()
allowed_files=("multifieldqa_en.jsonl" "qasper.jsonl" "2wikimqa.jsonl" "dureader.jsonl" "hotpotqa.jsonl" "narrativeqa.jsonl" "musique.jsonl" "multifieldqa_zh.jsonl")
# store all jsonl files
while IFS= read -r -d '' file; do
base_name=$(basename "$file")
# Check if the file name is in the allowed_files list
if [[ " ${allowed_files[@]} " =~ " ${base_name} " ]]; then
file_names+=("$base_name")
fi
done < <(find "$source_dir" -type f -name "*.jsonl" -print0)
# concurrent execution
group_size=3
for ((start=0; start<${#file_names[@]}; start+=group_size)); do
end=$((start + group_size - 1))
echo "Index Range:$start ~ $end"
current_group=("${file_names[@]:start:group_size}")
for file in "${current_group[@]}"; do
fileName=$(basename "${file}")
python generate_BM25.py \
--file_name $fileName \
--source_dir $source_dir \
--dest_dir $dest_dir \
--chunk_size $chunk_size \
&
done
wait
done
cp ../LongBench.py "${work_dir}/B${chunk_size}"
from rank_bm25 import BM25Okapi
import os
import json
from concurrent.futures import ThreadPoolExecutor, wait
from tqdm import tqdm
import argparse
import sys
sys.path.append('..')
from splitter import split_long_sentence, get_word_len, regex
# DEBUG
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
def retriveDoc(query: str, document: str, chunk_size, file_name:str,
js, output_list, idx, pbar=None, maxLen=1500):
# 1. Splits the context into pieces
texts = split_long_sentence(document, regex, chunk_size=chunk_size, filename=file_name)
# 2. Create the BM25 retriever and add the chunk texts
retriever = BM25Okapi(texts)
# 3. Retrieve and merge
retrieved_texts = retriever.get_top_n(query=query, documents=texts,
n=len(texts))
retrieved_texts = [retrieved_texts] if type(retrieved_texts) == str else retrieved_texts
context = ''
for text in retrieved_texts:
if get_word_len(context) < maxLen:
context += text
js['retrieved'] = retrieved_texts if type(retrieved_texts) == list else [retrieved_texts]
js['context'] = context
js['length'] = get_word_len(context)
output_list[idx] = js
if pbar:
pbar.update()
return
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# parser.add_argument("--file_name", default='dureader.jsonl')
parser.add_argument("--file_name", default='2wikimqa.jsonl')
parser.add_argument("--source_dir", default='../../LongBench/data')
parser.add_argument("--dest_dir", default='./test')
parser.add_argument("--chunk_size", type=int, default=200)
args = parser.parse_args()
file_name = args.file_name
print(f"------ {file_name} ------")
with open(os.path.join(args.source_dir, file_name), 'r', encoding='utf-8') as file:
file_contents = file.readlines()
output_data = [{}] * len(file_contents)
if (os.path.exists(os.path.join(args.dest_dir, file_name))):
with open(os.path.join(args.dest_dir, file_name), 'r', encoding='utf-8') as f:
lines = f.readlines()
lines = [line for line in lines]
output_data = [json.loads(line) for line in lines]
loop = tqdm(enumerate(file_contents), total=len(file_contents), desc=f'{file_name}')
exe_list = []
with ThreadPoolExecutor(max_workers=10) as executor:
# for index, line in loop:
for index, line in enumerate(file_contents):
if (output_data[index] != {} or
"context" in output_data[index].keys() and len(output_data[index]['context']) != 0):
loop.update()
continue
line_js = json.loads(line)
retriveDoc(query=line_js['input'], document=line_js['context'],
chunk_size=args.chunk_size, file_name=file_name,
js=line_js, output_list=output_data, idx=index, pbar=loop)
# exe_list.append(executor.submit(retriveDoc, query=line_js['input'], document=line_js['context'],
# chunk_size=args.chunk_size, file_name=file_name,
# js=line_js, output_list=output_data, idx=index, pbar=loop))
# loop.set_description(f'{file_name}')
wait(exe_list)
# saving
os.makedirs(args.dest_dir, exist_ok=True)
with open(os.path.join(args.dest_dir, file_name), 'w', encoding='utf-8') as output_file:
for item in output_data:
output_file.write(json.dumps(item, ensure_ascii=False) + '\n')
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import datasets
import json
_DESCRIPTION = """\
LongBench is a comprehensive benchmark for multilingual and multi-task purposes, with the goal of fully measuring and evaluating the ability of pre-trained language models to understand long text. This dataset consists of twenty different tasks, covering key long-text application scenarios such as multi-document QA, single-document QA, summarization, few-shot learning, synthetic tasks, and code completion.
"""
_HOMEPAGE = "https://github.com/THUDM/LongBench"
# _URL = r"https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip"
_URLS = {
"2wikimqa": "./data/2wikimqa.jsonl",
"dureader": "./data/dureader.jsonl",
"qasper": "./data/qasper.jsonl",
"hotpotqa": "./data/hotpotqa.jsonl",
"narrativeqa": "./data/narrativeqa.jsonl",
"musique": "./data/musique.jsonl",
"multifieldqa_zh":"./data/multifieldqa_zh.jsonl",
"multifieldqa_en":"./data/multifieldqa_en.jsonl",
}
task_list = [
"multifieldqa_en",
"qasper",
"2wikimqa",
"dureader",
"hotpotqa",
"narrativeqa",
"musique",
"multifieldqa_zh"
]
class LongBenchConfig(datasets.BuilderConfig):
def __init__(self, **kwargs):
super().__init__(version=datasets.Version("1.0.0"), **kwargs)
class LongBench(datasets.GeneratorBasedBuilder):
BUILDER_CONFIGS = [
LongBenchConfig(
name=task_name,
)
for task_name in task_list
]
def _info(self):
features = datasets.Features(
{
"input": datasets.Value("string"),
"context": datasets.Value("string"),
"answers": [datasets.Value("string")],
"length": datasets.Value("int32"),
"dataset": datasets.Value("string"),
"language": datasets.Value("string"),
"all_classes": [datasets.Value("string")],
"retrieved": [datasets.Value("string")],
"_id": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
)
def _split_generators(self, dl_manager):
task_name = self.config.name
data_dir = dl_manager.download(_URLS[task_name])
return [
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": os.path.join(
# data_dir, f"{task_name}.jsonl"
data_dir
),
},
)
]
def _generate_examples(self, filepath):
with open(filepath, encoding="utf-8") as f:
for idx, line in enumerate(f):
key = f"{self.config.name}-{idx}"
item = json.loads(line)
yield key, {
"input": item["input"],
"context": item["context"],
"answers": item["answers"],
"length": item["length"],
"dataset": item["dataset"],
"language": item["language"],
"retrieved": item["retrieved"],
"_id": item["_id"],
"all_classes": item["all_classes"],
}
## Introduction
This folder contains the code for retrieval-based context compression on LongBench, using three retrievers:
- BM25
- [Contriever](https://github.com/facebookresearch/contriever)
- OpenAI Embedding ([text-embedding-ada-002](https://openai.com/blog/new-and-improved-embedding-model))
First, download the LongBench dataset from HuggingFace and save it in `../LongBench/`, resulting in the following folder structure:
```
LongBench/
    LongBench/
        data/
            Put raw LongBench data here.
            2wikimqa.jsonl
            ...
    retrieval/
        BM25/
        contriever/
            contriever/: cloned from GitHub
            mcontriever/: mcontriever model from HuggingFace
        embedding/
        README.md: This file.
```
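The BM25 scripts in this commit split each context into chunks with `splitter.split_long_sentence`, rank the chunks against the query, and concatenate the top-ranked chunks until a word budget (1500 by default) is reached. Below is a minimal sketch of that ranking step, assuming simple whitespace tokenization and the documented `rank_bm25` API; the helper name `rank_chunks` and its inputs are illustrative, not taken from the scripts.
```python
# Minimal sketch of the BM25 chunk-ranking idea, not the exact script.
from rank_bm25 import BM25Okapi

def rank_chunks(query, chunks, budget=1500):
    """Rank chunks by BM25 score and greedily fill a word budget."""
    tokenized = [c.split() for c in chunks]      # whitespace tokenization (assumption)
    bm25 = BM25Okapi(tokenized)                  # build the BM25 index over the chunks
    ranked = bm25.get_top_n(query.split(), chunks, n=len(chunks))
    context, used = "", 0
    for chunk in ranked:                         # most relevant chunks first
        if used >= budget:
            break
        context += chunk
        used += len(chunk.split())
    return ranked, context
```
The real scripts count length with `splitter.get_word_len` (which handles Chinese character by character) and also store the full ranked list in the `retrieved` field.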
## Usage
Install the requirements with pip: `pip install -r requirements.txt`
### Retrieval
We take the Contriever method as an example.
1. Clone the contriever repository from https://github.com/facebookresearch/contriever
2. Replace the corresponding files in the cloned contriever directory with `contriever/passage_retrieval.py` and `contriever/generate_passage_embeddings.py` from this folder
3. Download the mcontriever model from https://huggingface.co/facebook/mcontriever
4. Run `mContriever.sh`
5. Each line of the resulting JSONL files gains a new field `retrieved`, which holds the chunks retrieved from the original context, sorted by the retriever's relevance ranking (see the sketch below).
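Downstream, `pred.py` rebuilds a compressed context by joining the first `top_k` entries of that `retrieved` list. A minimal sketch of reading one such line (the file path and `top_k` value are placeholders):
```python
# Sketch: consume the "retrieved" field the way pred.py does with --top_k.
import json

top_k = 7  # placeholder value
with open("../LongBench/C200/data/2wikimqa.jsonl", encoding="utf-8") as f:  # placeholder path
    example = json.loads(f.readline())
    # chunks are already sorted by the retriever, most relevant first
    compressed_context = "".join(example["retrieved"][:top_k])
    print(example["input"][:80], "->", len(compressed_context), "characters kept")
```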
### Evaluation
We take ChatGLM2-6B-32k as an example. First run [pred.py](pred.py):
```bash
python pred.py --model chatglm2-6b-32k --data C200 --top_k 7
```
Then evaluate via [eval.py](eval.py):
```bash
python eval.py --model chatglm2-6b-32k --data C200_7
```
The evaluation results are then written to `result_chatglm2-6b-32k/`. Note that the `--data` value for `eval.py` (`C200_7`) is the data name plus the `top_k` suffix, matching the prediction directory produced by `pred.py` (`chatglm2-6b-32k_pred_C200_7`).
import os
import json
import pandas as pd
import argparse
import re
from tqdm import tqdm
import sys
sys.path.append('..')
from splitter import split_long_sentence, regex
import concurrent.futures
# DEBUG
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
parser = argparse.ArgumentParser()
parser.add_argument("--input_folder", type=str, default='../source/docqa_only')
parser.add_argument("--chunk_size", type=int, default=200)
parser.add_argument("--output_folder", type=str, default='../datasets/C200_t/split')
args = parser.parse_args()
def process_jsonl_file(input_file, output_folder, chunk_size=100, filename='Unknown'):
with open(input_file, 'r', encoding='utf-8') as f_in:
lines = f_in.readlines()
# for idx, line in enumerate(lines):
loop = tqdm(lines, desc=filename)
for line in loop:
data = json.loads(line)
context = data.get('context', '')
chunks = split_long_sentence(context, regex, chunk_size, filename)
output_folder_name = os.path.join(output_folder, os.path.splitext(os.path.basename(input_file))[0])
if not os.path.exists(output_folder_name):
os.makedirs(output_folder_name)
output_data = []
for i, chunk in enumerate(chunks):
output_datum = {
'id': data['_id'] + '_' + str(i),
'text': chunk.strip(),
'title': ''
}
output_data.append(output_datum)
output_data = pd.DataFrame(output_data, index=range(len(output_data)))
output_tsv_file = os.path.join(output_folder_name, data['_id'] + '.tsv')
output_data.to_csv(output_tsv_file, sep='\t', index=False)
output_jsonl_file = os.path.join(output_folder_name, data['_id'] + '.jsonl')
output_data = {
'id': data['_id'],
# 'lang': 'zh' if "_zh" in input_file else 'en',
'lang' : 'zh' if 'zh' in data.get('context', '') else 'en',
'question': data.get('input', ''),
'answers': []
}
with open(output_jsonl_file, 'w', encoding='utf-8') as f_out:
f_out.write(json.dumps(output_data, ensure_ascii=False) + '\n')
def process_all_jsonl_files(input_folder, output_folder, chunk_size=1700):
if not os.path.exists(output_folder):
os.makedirs(output_folder)
loop = tqdm(os.listdir(input_folder))
allowed_files = ["multifieldqa_en.jsonl", "qasper.jsonl", "2wikimqa.jsonl", "dureader.jsonl", "hotpotqa.jsonl", "narrativeqa.jsonl", "musique.jsonl", "multifieldqa_zh.jsonl"]
for filename in loop:
if filename.endswith('.jsonl') and filename in allowed_files:
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
input_file = os.path.join(input_folder, filename)
loop.set_description(f"totalFile")
# process_jsonl_file(input_file, output_folder, chunk_size, filename)
executor.submit(process_jsonl_file, input_file, output_folder, chunk_size, filename)
# print("split {} done!".format(filename))
process_all_jsonl_files(args.input_folder, args.output_folder, chunk_size=args.chunk_size)
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import argparse
import csv
import logging
import pickle
import numpy as np
import torch
import transformers
import src.slurm
import src.contriever
import src.utils
import src.data
import src.normalize_text
def embed_passages(args, passages, model, tokenizer):
total = 0
allids, allembeddings = [], []
batch_ids, batch_text = [], []
with torch.no_grad():
for k, p in enumerate(passages):
batch_ids.append(p["id"])
if args.no_title or not "title" in p:
text = p["text"]
else:
text = p["title"] + " " + p["text"]
if args.lowercase:
text = text.lower()
if args.normalize_text:
text = src.normalize_text.normalize(text)
batch_text.append(text)
if len(batch_text) == args.per_gpu_batch_size or k == len(passages) - 1:
encoded_batch = tokenizer.batch_encode_plus(
batch_text,
return_tensors="pt",
max_length=args.passage_maxlength,
padding=True,
truncation=True,
)
encoded_batch = {k: v.cuda() for k, v in encoded_batch.items()}
embeddings = model(**encoded_batch)
embeddings = embeddings.cpu()
total += len(batch_ids)
allids.extend(batch_ids)
allembeddings.append(embeddings)
batch_text = []
batch_ids = []
if k % 100000 == 0 and k > 0:
print(f"Encoded passages {total}")
if [] != allembeddings:
allembeddings = torch.cat(allembeddings, dim=0).numpy()
return allids, allembeddings
def main(args):
model, tokenizer, _ = src.contriever.load_retriever(args.model_name_or_path)
print(f"Model loaded from {args.model_name_or_path}.", flush=True)
model.eval()
model = model.cuda()
if not args.no_fp16:
model = model.half()
for psg in args.psgs_list:
passages = src.data.load_passages(psg)
shard_size = len(passages) // args.num_shards
start_idx = args.shard_id * shard_size
end_idx = start_idx + shard_size
if args.shard_id == args.num_shards - 1:
end_idx = len(passages)
passages = passages[start_idx:end_idx]
# print(f"Embedding generation for {len(passages)} passages from idx {start_idx} to {end_idx}.")
allids, allembeddings = embed_passages(args, passages, model, tokenizer)
# save_file = os.path.join(args.output_dir, args.prefix + f"_{args.shard_id:02d}")
def get_file_name_without_extension(file_path):
base_name = os.path.basename(file_path)  # get the file name
file_name_without_extension = os.path.splitext(base_name)[0]  # strip the extension
return file_name_without_extension
fileName = get_file_name_without_extension(psg)
save_file = os.path.join(args.output_dir, fileName)
os.makedirs(args.output_dir, exist_ok=True)
print(f"Saving {len(allids)} passage embeddings to {save_file}.")
with open(save_file, mode="wb") as f:
pickle.dump((allids, allembeddings), f)
print(f"Total passages processed {len(allids)}. Written to {save_file}.")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--psgs_list", nargs='+', required=True)
# parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)")
parser.add_argument("--output_dir", type=str, default="wikipedia_embeddings", help="dir path to save embeddings")
parser.add_argument("--prefix", type=str, default="passages", help="prefix path to save embeddings")
parser.add_argument("--shard_id", type=int, default=0, help="Id of the current shard")
parser.add_argument("--num_shards", type=int, default=1, help="Total number of shards")
parser.add_argument(
"--per_gpu_batch_size", type=int, default=512, help="Batch size for the passage encoder forward pass"
)
parser.add_argument("--passage_maxlength", type=int, default=512, help="Maximum number of tokens in a passage")
parser.add_argument(
"--model_name_or_path", type=str, help="path to directory containing model weights and config file"
)
parser.add_argument("--no_fp16", action="store_true", help="inference in fp32")
parser.add_argument("--no_title", action="store_true", help="title not added to the passage body")
parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding")
parser.add_argument("--normalize_text", action="store_true", help="lowercase text before encoding")
args = parser.parse_args()
src.slurm.init_distributed_mode(args)
main(args)
#!/bin/bash
chunk_size=200
work_dir="../../LongBench" # dir for storing data
source_dir="${work_dir}/data" # source LongBench dir
chunk_dir="C${chunk_size}"
split_dir="${work_dir}/${chunk_dir}/split"
embed_dir="${work_dir}/${chunk_dir}/embed"
retrieved_dir="${work_dir}/${chunk_dir}/output"
python LB2mC.py \
--chunk_size ${chunk_size} \
--output_folder ${split_dir} \
--input_folder ${source_dir}
folder_names=()
# Traverse all subfolders under `split` dir
for folder in "$split_dir"/*; do
if [ -d "$folder" ]; then
# get the name of subfolder
folder_name=$(basename "$folder")
# concat
folder_path="$split_dir/$folder_name"
echo "$folder_path"
folder_names+=("$folder_name")
fi
done
# Traverse all subfolders under `split` dir
for folder in "${folder_names[@]}"; do
file_paths=()
# Traverse all files in a subfolder
for file in "$split_dir"/"$folder"/*.tsv; do
if [ -f "$file" ]; then
fileName=$(basename "${file%.*}")
file_paths+=("${split_dir}/${folder}/${fileName}.tsv")
fi
done
# Converts an array to a ' ' separated string
files_str=$(IFS=' '; echo "${file_paths[*]}")
# generate embeddings
python ./contriever/generate_passage_embeddings.py \
--model_name_or_path ./contriever/mcontriever \
--output_dir ${embed_dir}/${folder} \
--psgs_list $files_str \
--shard_id 0 --num_shards 1 \
--lowercase --normalize_text
# generate results of retrieval
tsv_files=("$split_dir/$folder"/*.tsv)
# concurrent execution
group_size=5
for ((start=0; start<${#tsv_files[@]}; start+=group_size)); do
end=$((start + group_size - 1))
echo "Index Range:$start ~ $end"
current_group=("${tsv_files[@]:start:group_size}")
for ((index=0; index<${#current_group[@]}; index+=1)); do
file=${current_group[index]}
fileName=$(basename "${file%.*}")
python ./contriever/passage_retrieval.py \
--model_name_or_path ./contriever/mcontriever \
--passages ${split_dir}/${folder}/${fileName}.tsv \
--passages_embeddings ${embed_dir}/${folder}/${fileName} \
--data ${split_dir}/${folder}/${fileName}.jsonl \
--output_dir ${retrieved_dir}/${folder} \
--lowercase --normalize_text \
--device "cuda" \
&
# --device "cuda:$(expr 4 + $index % 4)" \
done
wait
done
python merge_output.py \
--input_folder "${retrieved_dir}/${folder}" \
--output_file "${work_dir}/${chunk_dir}/mc2LB/${folder}.jsonl" \
--input_dataFile "${source_dir}/${folder}.jsonl" \
--output_dataFile "${work_dir}/${chunk_dir}/data/${folder}.jsonl"
done
cp ../LongBench.py "${work_dir}/${chunk_dir}"
import os
import json
import argparse
from tqdm import tqdm
import sys
sys.path.append('..')
from splitter import get_word_len
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder', type=str, default='mcontriever_output',
help='Path to the input folder containing jsonl files.')
parser.add_argument('--output_file', type=str, default='CONTENT.jsonl',
help='Output jsonl file name.')
parser.add_argument('--input_dataFile', type=str, default='inputData.jsonl',
help='Input datum jsonl file name.')
parser.add_argument('--output_dataFile', type=str, default='DATA.jsonl',
help='Output datum jsonl file name.')
args = parser.parse_args()
def merge_text(jsonl_file, maxLen=1500):
with open(jsonl_file, 'r', encoding='utf-8') as f:
data_list = json.load(f)
context_list = data_list['ctxs']
merged_text = ''
retrieved = []
for item in context_list:
if get_word_len(merged_text) < maxLen:
merged_text += item['text'] + '\n\n'
retrieved += [item['text']]
output_data = {
'context': merged_text,
'id': data_list['id'],
'retrieved': retrieved
}
return output_data
def process_all_jsonl_files(args):
input_folder = args.input_folder
output_file = args.output_file
# data_name = os.path.basename(os.path.normpath(input_folder))
os.makedirs(os.path.dirname(output_file), exist_ok=True)
output_data_list = []
with open(output_file, 'w', encoding='utf-8') as f_out:
# print("input_folder", input_folder)
loop = tqdm(os.listdir(input_folder), desc="merge")
for filename in loop:
if filename.endswith('.jsonl'):
jsonl_file = os.path.join(input_folder, filename)
output_data = merge_text(jsonl_file)
output_data_list += [output_data]
f_out.write(json.dumps(output_data, ensure_ascii=False) + '\n')
os.makedirs(os.path.dirname(args.output_dataFile), exist_ok=True)
with open(args.input_dataFile, 'r', encoding='utf-8') as in_data:
with open(args.output_dataFile, 'w', encoding='utf-8') as out_data:
for line in in_data:
data_l = json.loads(line)
for modified_data in output_data_list:
if data_l['_id'] == modified_data['id']:
data_l['context'] = modified_data['context']
data_l['length'] = get_word_len(data_l['context'])
data_l['retrieved'] = modified_data['retrieved']
break
out_data.write(json.dumps(data_l, ensure_ascii=False) + '\n')
process_all_jsonl_files(args)
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import argparse
import csv
import json
import logging
import pickle
import time
import glob
from pathlib import Path
import numpy as np
import torch
import transformers
import src.index
import src.contriever
import src.utils
import src.slurm
import src.data
from src.evaluation import calculate_matches
import src.normalize_text
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
def embed_queries(args, queries, model, tokenizer):
model.eval()
embeddings, batch_question = [], []
with torch.no_grad():
for k, q in enumerate(queries):
if args.lowercase:
q = q.lower()
if args.normalize_text:
q = src.normalize_text.normalize(q)
batch_question.append(q)
if len(batch_question) == args.per_gpu_batch_size or k == len(queries) - 1:
encoded_batch = tokenizer.batch_encode_plus(
batch_question,
return_tensors="pt",
max_length=args.question_maxlength,
padding=True,
truncation=True,
)
encoded_batch = {k: v.to(args.device) for k, v in encoded_batch.items()}
output = model(**encoded_batch)
embeddings.append(output.cpu())
batch_question = []
embeddings = torch.cat(embeddings, dim=0)
print(f"Questions embeddings shape: {embeddings.size()}")
return embeddings.numpy()
def index_encoded_data(index, embedding_files, indexing_batch_size):
allids = []
allembeddings = np.array([])
for i, file_path in enumerate(embedding_files):
print(f"Loading file {file_path}")
with open(file_path, "rb") as fin:
ids, embeddings = pickle.load(fin)
allembeddings = np.vstack((allembeddings, embeddings)) if allembeddings.size else embeddings
allids.extend(ids)
while allembeddings.shape[0] > indexing_batch_size:
allembeddings, allids = add_embeddings(index, allembeddings, allids, indexing_batch_size)
while allembeddings.shape[0] > 0:
allembeddings, allids = add_embeddings(index, allembeddings, allids, indexing_batch_size)
# print("Data indexing completed.")
def add_embeddings(index, embeddings, ids, indexing_batch_size):
end_idx = min(indexing_batch_size, embeddings.shape[0])
ids_toadd = ids[:end_idx]
embeddings_toadd = embeddings[:end_idx]
ids = ids[end_idx:]
embeddings = embeddings[end_idx:]
index.index_data(ids_toadd, embeddings_toadd)
return embeddings, ids
def validate(data, workers_num):
match_stats = calculate_matches(data, workers_num)
top_k_hits = match_stats.top_k_hits
print("Validation results: top k documents hits %s", top_k_hits)
top_k_hits = [v / len(data) for v in top_k_hits]
message = ""
for k in [5, 10, 20, 100]:
if k <= len(top_k_hits):
message += f"R@{k}: {top_k_hits[k-1]} "
print(message)
return match_stats.questions_doc_hits
def add_passages(data, passages, top_passages_and_scores):
# add passages to original data
merged_data = []
assert len(data) == len(top_passages_and_scores)
for i, d in enumerate(data):
results_and_scores = top_passages_and_scores[i]
docs = [passages[doc_id] for doc_id in results_and_scores[0]]
scores = [str(score) for score in results_and_scores[1]]
ctxs_num = len(docs)
d["ctxs"] = [
{
"id": results_and_scores[0][c],
"title": docs[c]["title"],
"text": docs[c]["text"],
"score": scores[c],
}
for c in range(ctxs_num)
]
def add_hasanswer(data, hasanswer):
# add hasanswer to data
for i, ex in enumerate(data):
for k, d in enumerate(ex["ctxs"]):
d["hasanswer"] = hasanswer[i][k]
def load_data(data_path):
if data_path.endswith(".json"):
with open(data_path, "r") as fin:
data = json.load(fin)
elif data_path.endswith(".jsonl"):
data = []
with open(data_path, "r") as fin:
for k, example in enumerate(fin):
example = json.loads(example)
data.append(example)
return data
def main(args):
print(f"Loading model from: {args.model_name_or_path}")
model, tokenizer, _ = src.contriever.load_retriever(args.model_name_or_path)
model.eval()
model = model.to(args.device)
if not args.no_fp16:
model = model.half()
index = src.index.Indexer(args.projection_size, args.n_subquantizers, args.n_bits)
# index all passages
input_paths = glob.glob(args.passages_embeddings)
input_paths = sorted(input_paths)
embeddings_dir = os.path.dirname(input_paths[0])
index_path = os.path.join(embeddings_dir, "index.faiss")
if args.save_or_load_index and os.path.exists(index_path):
index.deserialize_from(embeddings_dir)
else:
# print(f"Indexing passages from files {input_paths}")
start_time_indexing = time.time()
index_encoded_data(index, input_paths, args.indexing_batch_size)
print(f"Indexing time: {time.time()-start_time_indexing:.1f} s.")
if args.save_or_load_index:
index.serialize(embeddings_dir)
# load passages
passages = src.data.load_passages(args.passages)
passage_id_map = {x["id"]: x for x in passages}
data_paths = glob.glob(args.data)
alldata = []
for path in data_paths:
data = load_data(path)
output_path = os.path.join(args.output_dir, os.path.basename(path))
queries = [ex["question"] for ex in data]
questions_embedding = embed_queries(args, queries, model, tokenizer)
# get top k results
start_time_retrieval = time.time()
# top_ids_and_scores = index.search_knn(questions_embedding, args.n_docs)
top_ids_and_scores = index.search_knn(questions_embedding, len(passages))
print(f"{len(passages)} psgs: Search time: {time.time()-start_time_retrieval:.1f} s.")
add_passages(data, passage_id_map, top_ids_and_scores)
# hasanswer = validate(data, args.validation_workers)
# add_hasanswer(data, hasanswer)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as fout:
for ex in data:
json.dump(ex, fout, ensure_ascii=False)
fout.write("\n")
print(f"Saved results to {output_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--data",
# required=True,
type=str,
default='./data_longbench/split/2wikimqa/0a64d8873482d91efc595a508218c6ce881c13c95028039e.jsonl',
help=".json file containing question and answers, similar format to reader data",
)
parser.add_argument("--passages", type=str, default='./data_longbench/split/2wikimqa/0a64d8873482d91efc595a508218c6ce881c13c95028039e.tsv',
help="Path to passages (.tsv file)")
parser.add_argument("--passages_embeddings", type=str, default='./data_longbench/mEmbeddings/2wikimqa/0a64d8873482d91efc595a508218c6ce881c13c95028039e.jsonl',
help="Glob path to encoded passages")
parser.add_argument(
"--output_dir", type=str, default=None, help="Results are written to outputdir with data suffix"
)
parser.add_argument("--n_docs", type=int, default=100, help="Number of documents to retrieve per questions")
parser.add_argument(
"--validation_workers", type=int, default=32, help="Number of parallel processes to validate results"
)
parser.add_argument("--per_gpu_batch_size", type=int, default=64, help="Batch size for question encoding")
parser.add_argument(
"--save_or_load_index", action="store_true", help="If enabled, save index and load index if it exists"
)
parser.add_argument(
"--model_name_or_path", type=str, default='./../mcontriever',
help="path to directory containing model weights and config file"
)
parser.add_argument("--no_fp16", action="store_true", help="inference in fp32")
parser.add_argument("--question_maxlength", type=int, default=512, help="Maximum number of tokens in a question")
parser.add_argument(
"--indexing_batch_size", type=int, default=1000000, help="Batch size of the number of passages indexed"
)
parser.add_argument("--projection_size", type=int, default=768)
parser.add_argument(
"--n_subquantizers",
type=int,
default=0,
help="Number of subquantizer used for vector quantization, if 0 flat index is used",
)
parser.add_argument("--n_bits", type=int, default=8, help="Number of bits per subquantizer")
parser.add_argument("--lang", nargs="+")
parser.add_argument("--dataset", type=str, default="none")
parser.add_argument("--lowercase", action="store_true", default=True, help="lowercase text before encoding")
parser.add_argument("--normalize_text", action="store_true", default=True, help="normalize text")
parser.add_argument("--device", type=str, default='cuda', help="normalize text")
args = parser.parse_args()
src.slurm.init_distributed_mode(args)
main(args)
import openai
from openai.embeddings_utils import cosine_similarity
openai.api_key="KEY"
openai.proxy=""
import os
import json
from concurrent.futures import ThreadPoolExecutor, wait
from tqdm import tqdm
import argparse
import sys
sys.path.append("..")
from splitter import split_long_sentence, get_word_len, regex
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
def retriveDoc(query: str, document: str, chunk_size, file_name:str,
js, output_list, idx, pbar=None, maxLen=1500):
# 1. Splits the context into pieces
texts = split_long_sentence(document, regex, chunk_size=chunk_size, filename=file_name)
# 2. Embed the chunk texts and the query with the OpenAI embedding API
# 3. Retrieve (rank by cosine similarity) and merge
# https://platform.openai.com/docs/api-reference/embeddings/object?lang=python
texts_embeddings = openai.Embedding.create(
model="text-embedding-ada-002",
input=texts
)
query_embeddings = openai.Embedding.create(
model="text-embedding-ada-002",
input=query
)
similarity = []
for emb in texts_embeddings['data']:
similarity.append(cosine_similarity(emb['embedding'], query_embeddings['data'][0]['embedding']))
sorted_pairs=sorted(zip(similarity, texts), reverse=True)
retrieved_texts = [pair[1] for pair in sorted_pairs]
retrieved_texts = [retrieved_texts] if type(retrieved_texts) == str else retrieved_texts
context = ''
for text in retrieved_texts:
if get_word_len(context) < maxLen:
context += text
js['retrieved'] = retrieved_texts if type(retrieved_texts) == list else [retrieved_texts]
js['context'] = context
js['length'] = get_word_len(context)
output_list[idx] = js
if pbar:
pbar.update()
return
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# parser.add_argument("--file_name", default='dureader.jsonl')
parser.add_argument("--file_name", default='musique.jsonl')
parser.add_argument("--source_dir", default='../source/docqa_only')
parser.add_argument("--dest_dir", default='./test')
parser.add_argument("--chunk_size", type=int, default=200)
args = parser.parse_args()
file_name = args.file_name
print(f"------ {file_name} ------")
with open(os.path.join(args.source_dir, file_name), 'r', encoding='utf-8') as file:
file_contents = file.readlines()
# DEBUG
# file_contents = file_contents[:10]
# with tqdm(total=len(file_contents)) as pbar, ThreadPoolExecutor(max_workers=1) as executor:
output_data = [{}] * len(file_contents)
if (os.path.exists(os.path.join(args.dest_dir, file_name))):
with open(os.path.join(args.dest_dir, file_name), 'r', encoding='utf-8') as f:
lines = f.readlines()
lines = [line for line in lines]
output_data = [json.loads(line) for line in lines]
def saving():
os.makedirs(args.dest_dir, exist_ok=True)
with open(os.path.join(args.dest_dir, file_name), 'w', encoding='utf-8') as output_file:
for item in output_data:
output_file.write(json.dumps(item, ensure_ascii=False) + '\n')
loop = tqdm(enumerate(file_contents), total=len(file_contents), desc=f'{file_name}')
exe_list = []
with ThreadPoolExecutor(max_workers=3) as executor:
# for index, line in loop:
for index, line in enumerate(file_contents):
if (output_data[index] != {} or
"context" in output_data[index].keys() and len(output_data[index]['context']) != 0):
loop.update()
continue
line_js = json.loads(line)
try:
# retriveDoc(query=line_js['input'], document=line_js['context'],
# chunk_size=args.chunk_size, file_name=file_name,
# js=line_js, output_list=output_data, idx=index, pbar=loop)
exe_list.append(executor.submit(retriveDoc, query=line_js['input'], document=line_js['context'],
chunk_size=args.chunk_size, file_name=file_name,
js=line_js, output_list=output_data, idx=index, pbar=loop))
except Exception as e:
saving()
print(e)
wait(exe_list)
saving()
#!/bin/bash
chunk_size=500
work_dir="../../LongBench" # dir for storing data
source_dir="${work_dir}/data" # source LongBench dir
dest_dir=""${work_dir}/E${chunk_size}/data""
file_names=()
allowed_files=("multifieldqa_en.jsonl" "qasper.jsonl" "2wikimqa.jsonl" "dureader.jsonl" "hotpotqa.jsonl" "narrativeqa.jsonl" "musique.jsonl" "multifieldqa_zh.jsonl")
# store all jsonl files
while IFS= read -r -d '' file; do
base_name=$(basename "$file")
# Check if the file name is in the allowed_files list
if [[ " ${allowed_files[@]} " =~ " ${base_name} " ]]; then
file_names+=("$base_name")
fi
done < <(find "$source_dir" -type f -name "*.jsonl" -print0)
# concurrent execution
group_size=3
for ((start=0; start<${#file_names[@]}; start+=group_size)); do
end=$((start + group_size - 1))
echo "Index Range:$start ~ $end"
current_group=("${file_names[@]:start:group_size}")
for file in "${current_group[@]}"; do
fileName=$(basename "${file}")
python generate_openai_embedding.py \
--file_name $fileName \
--source_dir $source_dir \
--dest_dir $dest_dir \
--chunk_size $chunk_size \
&
done
wait
done
cp ../LongBench.py "${work_dir}/E${chunk_size}"
import os
import json
import argparse
args = argparse.ArgumentParser()
args.add_argument("--data", type=str, default="C200_7")
args.add_argument("--model", type=str, default="chatglm2-6b")
args = args.parse_args()
import sys
sys.path.append("..")
from metrics import (
qa_f1_score,
rouge_zh_score,
qa_f1_zh_score,
rouge_score,
classification_score,
retrieval_score,
retrieval_zh_score,
count_score,
code_sim_score,
)
dataset2metric = {
"hotpotqa": qa_f1_score,
"2wikimqa": qa_f1_score,
"musique": qa_f1_score,
"dureader": rouge_zh_score,
"narrativeqa": qa_f1_score,
"qasper": qa_f1_score,
"multifieldqa_en": qa_f1_score,
"multifieldqa_zh": qa_f1_zh_score,
"gov_report": rouge_score,
"qmsum": rouge_score,
"vcsum": rouge_zh_score,
"trec": classification_score,
"nq": qa_f1_score,
"triviaqa": qa_f1_score,
"lsht": classification_score,
"passage_retrieval_en": retrieval_score,
"passage_count": count_score,
"passage_retrieval_zh": retrieval_zh_score,
"lcc": code_sim_score,
"repobench-p": code_sim_score,
}
def scorer(dataset, predictions, answers, all_classes):
total_score = 0.
for (prediction, ground_truths) in zip(predictions, answers):
score = 0.
for ground_truth in ground_truths:
score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
total_score += score
return round(100 * total_score / len(predictions), 2)
if __name__ == '__main__':
scores = dict()
all_files = os.listdir(f"{args.model}_pred_{args.data}")
for filename in all_files:
predictions, answers = [], []
dataset = filename.split('.')[0]
with open(f"{args.model}_pred_{args.data}/{filename}", "r", encoding='utf-8') as f:
for line in f:
data = json.loads(line)
predictions.append(data["pred"])
answers.append(data["answers"])
all_classes = data["all_classes"]
score = scorer(dataset, predictions, answers, all_classes)
scores[dataset] = score
os.makedirs(f"result_{args.model}", exist_ok=True)
with open(f"result_{args.model}/{args.data}.json", "w", encoding='utf-8') as f:
json.dump(scores, f, ensure_ascii=False, indent=4)
import os
from datasets import load_dataset
import torch
import json
from transformers import AutoTokenizer, AutoModel, LlamaTokenizer, LlamaForCausalLM, AutoModelForCausalLM
from tqdm import tqdm
import argparse
# DEBUG
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
def parse_args(args=None):
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="chatglm2-6b")
parser.add_argument("--top_k", type=int, default=3)
parser.add_argument("--data", type=str, default="B500")
return parser.parse_args(args)
# Customized prompt building for chat models; here is an example for ChatGLM2
def build_chat(tokenizer, prompt, model_name):
if "chatglm" in model_name:
prompt = tokenizer.build_prompt(prompt)
elif "longchat" in model_name or "vicuna" in model_name:
from fastchat.model import get_conversation_template
conv = get_conversation_template("vicuna")
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
elif "llama2" in model_name:
prompt = f"[INST]{prompt}[/INST]"
elif "xgen" in model_name:
header = (
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
)
prompt = header + f" ### Human: {prompt}\n###"
elif "internlm" in model_name:
prompt = f"<|User|>:{prompt}<eoh>\n<|Bot|>:"
return prompt
def get_pred(model, tokenizer, data, max_length, max_gen, prompt_format, dataset, device, model_name, args):
preds = [{}] * len(data)
if os.path.exists(f"{args.model}_pred_{args.data}_{args.top_k}/{dataset}.jsonl"):
with open(f"{args.model}_pred_{args.data}_{args.top_k}/{dataset}.jsonl", "r", encoding="utf-8") as f:
for index, item in enumerate(f):
preds[index] = json.loads(item)
for index, json_obj in enumerate(tqdm(data, desc=f"{dataset}")):
if preds[index] != {}:
continue
if args.top_k != 0:
json_obj['context'] = "".join(json_obj['retrieved'][:args.top_k])
prompt = prompt_format.format(**json_obj)
if "chatgpt" in model_name:
output = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k",
messages=[{"role": "user", "content": prompt}], max_tokens=max_gen,
temperature=1.0)
pred = output['choices'][0]['message']['content']
context_length = output['usage']['prompt_tokens']
else:
# truncate to fit max_length (we suggest truncating in the middle, since the left and right sides may contain crucial instructions)
tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
if len(tokenized_prompt) > max_length:
half = int(max_length/2)
prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
if dataset not in ["trec", "triviaqa", "samsum", "lsht", "lcc", "repobench-p"]: # chat models are better off without build prompt on these tasks
prompt = build_chat(tokenizer, prompt, model_name)
input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
context_length = input.input_ids.shape[-1]
if dataset == "samsum": # prevent illegal output on samsum (model endlessly repeat "\nDialogue"), might be a prompting issue
output = model.generate(
**input,
max_new_tokens=max_gen,
num_beams=1,
do_sample=False,
temperature=1.0,
min_length=context_length+1,
eos_token_id=[tokenizer.eos_token_id, tokenizer.encode("\n", add_special_tokens=False)[-1]],
)[0]
else:
output = model.generate(
**input,
max_new_tokens=max_gen,
num_beams=1,
do_sample=False,
temperature=1.0,
)[0]
pred = tokenizer.decode(output[context_length:], skip_special_tokens=True)
pred = post_process(pred, model_name)
preds[index] = {"pred": pred, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"],
"context_length": context_length}
with open(f"{args.model}_pred_{args.data}_{args.top_k}/{dataset}.jsonl", "w", encoding="utf-8") as f:
for pred in preds:
json.dump(pred, f, ensure_ascii=False)
f.write('\n')
return preds
def post_process(response, model_name):
if "xgen" in model_name:
response = response.strip().replace("Assistant:", "")
elif "internlm" in model_name:
response = response.split("<eoa>")[0]
return response
def load_model_and_tokenizer(model2path, model_name, device):
if "chatgpt" in model_name:
return model_name, model_name
else:
if "chatglm" in model_name or "internlm" in model_name or "xgen" in model_name:
tokenizer = AutoTokenizer.from_pretrained(model2path[model_name], trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model2path[model_name], trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
elif "llama2" in model_name:
tokenizer = LlamaTokenizer.from_pretrained(model2path[model_name])
model = LlamaForCausalLM.from_pretrained(model2path[model_name], torch_dtype=torch.bfloat16).to(device)
elif "longchat" in model_name or "vicuna" in model_name:
from fastchat.model import load_model
model, _ = load_model(
model2path[model_name],
device='cpu',
num_gpus=0,
load_8bit=False,
cpu_offloading=False,
debug=False,
)
model = model.to(device)
model = model.bfloat16()
tokenizer = AutoTokenizer.from_pretrained(model2path[model_name], trust_remote_code=True, use_fast=False)
model = model.eval()
return model, tokenizer
if __name__ == '__main__':
args = parse_args()
model_name = args.model
if "chatgpt" in model_name:
import openai
# openai.api_base=""
openai.api_key = "YOUR_KEY"
# Retrieval is suitable for these datasets
datasets = ["multifieldqa_en", "qasper", "2wikimqa", "dureader", \
"hotpotqa", "narrativeqa", "musique", "multifieldqa_zh"]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# load configs
model2path = json.load(open("../config/model2path.json", "r"))
model2maxlen = json.load(open("../config/model2maxlen.json", "r"))
# we design specific prompt format and max generation length for each task, feel free to modify them to optimize model output
dataset2prompt = json.load(open("../config/dataset2prompt.json", "r"))
dataset2maxlen = json.load(open("../config/dataset2maxlen.json", "r"))
# define your model
model, tokenizer = load_model_and_tokenizer(model2path, model_name, device)
max_length = model2maxlen[model_name]
# predict on each dataset
os.makedirs(f"{args.model}_pred_{args.data}_{args.top_k}", exist_ok=True)
for dataset in datasets:
data = load_dataset(f'../LongBench/{args.data}/LongBench.py', dataset, split='test',
download_mode='force_redownload') # force to load from dir
prompt_format = dataset2prompt[dataset]
max_gen = dataset2maxlen[dataset]
preds = get_pred(model, tokenizer, data, max_length, max_gen, prompt_format, dataset, device, model_name, args)
rank_bm25
openai
beir
import re
def split_long_sentence(sentence, regex, chunk_size=200, filename='Unknown'):
chunks = []
sentences = re.split(regex, sentence)
current_chunk = ""
for s in sentences:
if current_chunk and get_word_len(current_chunk) + get_word_len(s) <= chunk_size:
current_chunk += ' ' if s == '' else s
else:
if current_chunk:
chunks.append(current_chunk)
# if (len(current_chunk) > chunk_size*5):
current_len = get_word_len(current_chunk)
if (current_len > chunk_size * 1.5):
print(f"\n{filename}-{len(chunks)-1} Chunk size: {current_len}")
current_chunk = s
if current_chunk:
chunks.append(current_chunk)
return chunks
def get_word_list(s1):
# Split a sentence into units: Chinese character by character, English word by word, numbers separated by spaces
regEx = re.compile(r'[\W]')
res = re.compile(r"([\u4e00-\u9fa5])") # [\u4e00-\u9fa5] for Chinese
p1 = regEx.split(s1.lower())
str1_list = []
for str in p1:
if res.split(str) == None:
str1_list.append(str)
else:
ret = res.split(str)
for ch in ret:
str1_list.append(ch)
list_word1 = [w for w in str1_list if len(w.strip()) > 0]
return list_word1
def get_word_len(s1):
return len(get_word_list(s1))
regex = r'([。?!;\n.!?;]\s*)'