Skip to content
Snippets Groups Projects
Unverified commit 98627aad, authored by Jiajie Zhang, committed by GitHub
Browse files

Update retrieve.py

parent 89b8f1ba
No related branches found
No related tags found
No related merge requests found
......@@ -13,7 +13,7 @@ tokenizer = AutoTokenizer.from_pretrained("THUDM/LongCite-glm4-9b", trust_remote
def text_split_by_punctuation(original_text, return_dict=False):
# text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', original_text) # separate period without space
text = original_text
- custom_sent_tokenizer = PunktSentenceTokenizer(text)
+ custom_sent_tokenizer = PunktSentenceTokenizer()
punctuations = r"([。;!?])" # For Chinese support
separated = custom_sent_tokenizer.tokenize(text)
......@@ -118,4 +118,4 @@ def batch_search(queries, contexts, k=20):
'query': queries[i],
'retrieve_results': chunks
})
- return res
\ No newline at end of file
+ return res
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment