diff --git a/tests/integration/57640.4032.txt b/tests/integration/57640.4032.txt
new file mode 100644
index 0000000000000000000000000000000000000000..292700b19ffb90cecb3c5119f5b8607c9e7d8cb3
--- /dev/null
+++ b/tests/integration/57640.4032.txt
@@ -0,0 +1,222 @@
+arXiv:2304.04675v3 [cs.CL] 29 Oct 2023
+Multilingual Machine Translation with Large Language Models: Empirical Results and Analysis
+Wenhao Zhu1,2∗, Hongyi Liu3∗, Qingxiu Dong4, Jingjing Xu2, Shujian Huang1, Lingpeng Kong5, Jiajun Chen1, Lei Li6
+1 National Key Laboratory for Novel Software Technology, Nanjing University  2 Shanghai AI Lab  3 Shanghai Jiao Tong University  4 Peking University  5 The University of Hong Kong  6 University of California, Santa Barbara
+zhuwh@smail.nju.edu.cn, liu.hong.yi@sjtu.edu.cn, dqx@stu.pku.edu.cn, jingjingxu@pku.edu.cn, huangsj@nju.edu.cn, lpk@cs.hku.hk, chenjj@nju.edu.cn, lilei@cs.ucsb.edu
+[Figure 1 plot: multilingual translation performance by language family (Indo-Euro-Germanic, Indo-Euro-Romance, Indo-Euro-Slavic, Indo-Euro-Indo-Aryan, Indo-Euro-Other, Austronesian, Atlantic-Congo, Afro-Asiatic, Sino-Tibetan, Dravidian, Other); legend: Falcon-7B, LLaMA2-7B, ChatGPT, GPT-4, NLLB-1.3B, Google Translate]
+# Abstract
+Large language models (LLMs) have demonstrated remarkable potential in handling multilingual machine translation (MMT). In this paper, we systematically investigate the advantages and challenges of LLMs for MMT by answering two questions: 1) How well do LLMs perform in translating massive languages? 2) Which factors affect LLMs' performance in translation? We thoroughly evaluate eight popular LLMs, including ChatGPT and GPT-4. Our empirical results show that the translation capabilities of LLMs are continually improving. GPT-4 has beaten the strong supervised baseline NLLB in 40.91% of translation directions but still faces a large gap towards the commercial translation system, especially on low-resource languages. Through further analysis, we discover that LLMs exhibit new working patterns when used for MMT. First, instruction semantics can surprisingly be ignored when given in-context exemplars. Second, cross-lingual exemplars can provide better task guidance for low-resource translation than exemplars in the same language pair. Third, LLM can acquire translation ability in a resource-efficient way and generate moderate translations even for zero-resource languages1.
+Figure 1: Multilingual translation performance (translating from English to non-English) of some popular LLMs and traditional supervised systems. LLMs have demonstrated great potential in multilingual machine translation.
+However, the multilingual translation ability of LLMs remains under-explored. MMT is a challenging task that involves translating text among different languages and requires semantic alignment between languages (Fan et al., 2021; Costa-jussà et al., 2022; Yuan et al., 2023). It is also unclear how LLM acquires translation ability and which factors affect LLM's translation ability.
+# 1 Introduction
+With the increasing scale of parameters and training corpora, large language models (LLMs) have gained a universal ability to handle a variety of tasks via in-context learning (ICL, Brown et al. 2020), which allows language models to perform tasks with a few given exemplars and human-written instructions as context. One particular area where LLMs have shown outstanding potential is machine translation (MT). Previous studies have shown the surprising performance of LLMs on high-resource bilingual translation, such as English-German translation (Vilar et al., 2022; Zhang et al., 2022), even if these models are not particularly optimized on multilingual data.
+1Code will be released at: https://github.com/NJUNLP/MMT-LLM.
+In this paper, we follow the ICL paradigm and focus on studying LLMs in multilingual machine translation by answering two questions: 1) How well do LLMs perform MMT over massive languages? 2) Which factors affect the performance of LLMs?
+For the first question, we evaluate several popular LLMs: English-centric LLMs, including OPT (Zhang et al., 2022), LLaMA2 (Touvron et al., 2023), Falcon (Almazrouei et al., 2023), and multilingual LLMs, including XGLM (Lin et al., 2022), BLOOMZ (Scao et al., 2022), ChatGPT (OpenAI, 2022), GPT-4 (OpenAI, 2023), and consider 102 languages and 606 translation directions (202 English-centric directions, 202 French-centric directions and 202 Chinese-centric directions). Results show that the multilingual translation capabilities of LLMs are continually improving and GPT-4 reaches new performance heights. Compared with the widely-used supervised MMT system NLLB (Costa-jussà et al., 2022), GPT-4 achieves higher performance on 40.91% of English-centric translation directions. But compared with the commercial translation system (Google Translator), LLMs still have a long way to go, particularly when it comes to low-resource languages. French-centric and Chinese-centric translation are more challenging for GPT-4 than English-centric translation, which further indicates its unbalanced capability across languages.
+For the second question, we find some new working patterns. First, LLMs are able to perform translation even with unreasonable instructions if in-context learning exemplars are given. However, if given mismatched translation pairs as in-context exemplars, LLMs fail to translate, which is similar to observations from concurrent studies (Wei et al., 2023). This shows the importance of exemplars in ICL for machine translation. Second, we find that cross-lingual translation pairs can be surprisingly good exemplars for low-resource translation, even better than exemplars in the same language. Third, we discover that LLM can acquire translation ability in a resource-efficient way and generate moderate translations even on zero-resource languages.
+The main contributions of this paper can be summarized below:
+• We benchmark popular LLMs on MMT in 102 languages and 606 translation directions, covering English-centric, French-centric and Chinese-centric translation.
+• We systematically compare the results of LLMs and three strong supervised baselines (M2M-100, NLLB, Google Translator) and reveal the gap between the two translation paradigms.
+• We find some new ICL working patterns of LLMs for MMT and discuss corresponding advantages and challenges.
+# 2 Background
+# 2.1 Large Language Models
+Language modeling is a long-standing task in natural language processing (Bengio et al., 2000; Mikolov et al., 2010; Khandelwal et al., 2020), whose goal is to predict the probability of the next token. The Transformer (Vaswani et al., 2017) is the backbone of essentially all existing LLMs.
+LLMs show great potential as universal multi-task learners. Recently, Radford et al. (2019) find that a causal decoder-only language model can be a multi-task learner with merely an unsupervised training corpus. Later, Kaplan et al. (2020) reveal the scaling law of LLMs, indicating that LLMs can be further strengthened as the scale of neural parameters and training data keeps increasing. Wei et al. (2022b) show that scaling the language model also brings astonishing emergent abilities, e.g., in-context learning, which are only present in large models. Consequently, more and more effort has been put into scaling up language models (Brown et al., 2020; Hoffmann et al., 2022; Scao et al., 2022; Vilar et al., 2022; Ren et al., 2023). Among them, GPT-4 (OpenAI, 2023) and ChatGPT (OpenAI, 2022) are the most representative systems, which show impressive results on various NLP tasks.
+# 2.2 Emergent Ability: In-context Learning
+In-context learning is one of the well-known emergent abilities (Brown et al., 2020; Dong et al., 2022), which enables LLMs to learn target tasks according to the prompt without updating any parameters.
+Specifically, the prompt is made up of in-context exemplars {(X1, Y1), ..., (Xk, Yk)} and an in-context template T. Exemplars are often picked from supervised data, where Yi is the ground truth corresponding to the input sentence Xi. The template T is usually a human-written instruction related to the target task. Wrapping exemplars with the template and concatenating them together produces the final prompt:
+P = T (X1, Y1) ⊕ T (X2, Y2) ⊕ · · · ⊕ T (Xk, Yk)
+where ⊕ denotes the concatenation symbol, e.g., whitespace, line-break. During inference, LLM is able to generate the corresponding output Y of the test sample X under the guidance of the prompt:
+arg max_Y p(P ⊕ T (X, Y))    (1)
+For label prediction tasks, the prediction Y can be obtained in one-step generation. For sequence generation tasks, e.g., machine translation, the prediction Y can be obtained through decoding strategies like greedy search and beam search.
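+As a concrete illustration of Eq. (1), the sketch below decodes the output for a test input given a prompt P with an off-the-shelf causal LM. It assumes the Hugging Face transformers library and uses facebook/xglm-7.5B only as an example checkpoint; it is a minimal sketch, not the authors' implementation (their experiments are based on OpenICL, see Section 3).
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Example open-source checkpoint; any causal LM evaluated in the paper could be used.
+tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-7.5B")
+model = AutoModelForCausalLM.from_pretrained("facebook/xglm-7.5B")
+
+def generate_translation(prompt: str, num_beams: int = 1) -> str:
+    # Decode the continuation of `prompt`; num_beams=1 is greedy search,
+    # num_beams>1 is beam search (the two decoding strategies mentioned above).
+    inputs = tokenizer(prompt, return_tensors="pt")
+    output_ids = model.generate(
+        **inputs,
+        max_new_tokens=128,
+        do_sample=False,   # deterministic search, no sampling
+        num_beams=num_beams,
+    )
+    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
+    # Take the first generated line as the hypothesis Y.
+    return tokenizer.decode(new_tokens, skip_special_tokens=True).split("\n")[0]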
+Language Family | Direction | XGLM-7.5B | OPT-175B | Falcon-7B | LLaMA2-7B | LLaMA2-7B-Chat | ChatGPT | GPT-4 | M2M-12B | NLLB-1.3B | Google (each cell: BLEU / COMET)
+Indo-Euro-Germanic (8) | X⇒Eng | 18.54 / 70.09 | 34.65 / 83.71 | 27.37 / 67.40 | 37.28 / 84.73 | 34.82 / 84.25 | 45.83 / 89.05 | 48.51 / 89.48 | 42.72 / 87.74 | 46.54 / 88.18 | 51.16 / 89.36
+Indo-Euro-Germanic (8) | Eng⇒X | 9.16 / 50.21 | 18.89 / 71.97 | 13.19 / 52.93 | 22.78 / 76.05 | 19.44 / 73.63 | 36.34 / 87.83 | 40.64 / 88.50 | 37.30 / 86.47 | 38.47 / 87.31 | 45.27 / 89.05
+Indo-Euro-Romance (8) | X⇒Eng | 31.11 / 79.67 | 38.93 / 87.75 | 34.06 / 84.40 | 41.10 / 88.10 | 37.84 / 87.80 | 45.68 / 89.61 | 47.29 / 89.74 | 42.33 / 88.31 | 46.33 / 88.99 | 35.69 / 89.66
+Indo-Euro-Romance (8) | Eng⇒X | 21.95 / 69.08 | 24.30 / 79.07 | 20.02 / 70.36 | 27.81 / 82.05 | 25.50 / 79.67 | 41.35 / 89.00 | 44.47 / 88.94 | 42.98 / 87.56 | 43.48 / 88.12 | 37.10 / 88.77
+Indo-Euro-Slavic (12) | X⇒Eng | 13.20 / 64.24 | 20.83 / 74.80 | 13.15 / 57.34 | 34.00 / 84.90 | 30.94 / 83.90 | 39.27 / 87.74 | 41.19 / 88.15 | 35.87 / 85.97 | 39.23 / 87.08 | 43.61 / 88.18
+Indo-Euro-Slavic (12) | Eng⇒X | 6.40 / 43.28 | 8.18 / 54.45 | 4.34 / 35.73 | 20.24 / 76.30 | 16.14 / 69.75 | 32.61 / 87.90 | 36.06 / 89.15 | 35.01 / 86.43 | 36.56 / 88.74 | 42.75 / 90.05
+Indo-Euro-Indo-Aryan (10) | X⇒Eng | 8.68 / 63.93 | 1.20 / 49.37 | 1.40 / 45.22 | 6.68 / 62.63 | 4.29 / 60.29 | 25.32 / 84.14 | 37.30 / 87.79 | 17.53 / 69.66 | 40.75 / 88.80 | 45.66 / 89.43
+Indo-Euro-Indo-Aryan (10) | Eng⇒X | 4.76 / 40.99 | 0.14 / 31.85 | 0.13 / 25.84 | 1.61 / 35.92 | 1.24 / 34.74 | 16.50 / 68.43 | 21.35 / 73.75 | 14.44 / 65.32 | 34.04 / 82.55 | 39.04 / 82.78
+Indo-Euro-Other (11) | X⇒Eng | 7.32 / 55.29 | 7.80 / 59.60 | 7.04 / 51.59 | 14.27 / 69.87 | 11.46 / 67.64 | 29.54 / 84.52 | 37.29 / 86.76 | 22.38 / 77.47 | 36.16 / 86.81 | 41.68 / 88.29
+Indo-Euro-Other (11) | Eng⇒X | 4.51 / 40.60 | 3.10 / 40.04 | 3.38 / 34.64 | 5.00 / 44.09 | 4.83 / 43.73 | 22.81 / 77.33 | 28.45 / 80.94 | 19.71 / 74.90 | 31.65 / 85.82 | 38.54 / 87.44
+Austronesian (6) | X⇒Eng | 16.19 / 78.80 | 25.60 / 78.03 | 18.62 / 75.36 | 26.70 / 80.21 | 24.39 / 80.39 | 39.95 / 87.29 | 46.81 / 88.65 | 31.84 / 84.76 | 45.41 / 87.85 | 50.68 / 88.89
+Austronesian (6) | Eng⇒X | 10.01 / 73.14 | 10.68 / 64.97 | 8.56 / 60.89 | 14.59 / 74.80 | 13.29 / 74.88 | 30.17 / 86.36 | 34.66 / 87.68 | 27.03 / 86.83 | 37.17 / 88.82 | 40.74 / 89.34
+Atlantic-Congo (14) | X⇒Eng | 6.67 / 62.00 | 9.17 / 57.59 | 6.98 / 0.56 | 8.76 / 57.72 | 9.01 / 57.86 | 19.86 / 79.63 | 28.27 / 83.42 | 10.55 / 76.43 | 32.20 / 84.00 | 23.55 / 85.44
+Atlantic-Congo (14) | Eng⇒X | 2.52 / 54.93 | 1.60 / 34.15 | 1.89 / 0.34 | 2.45 / 34.17 | 3.09 / 38.13 | 8.91 / 75.26 | 13.70 / 77.79 | 6.53 / 75.79 | 21.99 / 79.95 | 16.77 / 80.89
+Afro-Asiatic (6) | X⇒Eng | 6.70 / 54.51 | 5.93 / 52.90 | 4.87 / 38.62 | 10.41 / 57.72 | 8.65 / 58.27 | 20.84 / 70.39 | 30.48 / 78.76 | 10.00 / 66.98 | 32.69 / 82.99 | 36.14 / 84.47
+Afro-Asiatic (6) | Eng⇒X | 2.07 / 41.48 | 1.40 / 41.86 | 1.40 / 27.64 | 3.22 / 43.04 | 3.07 / 43.39 | 13.57 / 67.60 | 19.36 / 75.56 | 7.83 / 68.86 | 26.08 / 82.84 | 31.00 / 83.78
+Turkic (5) | X⇒Eng | 7.43 / 61.69 | 7.89 / 62.47 | 4.15 / 33.11 | 9.51 / 65.95 | 8.88 / 66.15 | 24.64 / 84.04 | 31.73 / 86.90 | 10.25 / 58.52 | 32.92 / 87.51 | 37.78 / 88.53
+Turkic (5) | Eng⇒X | 3.48 / 40.32 | 2.58 / 44.80 | 1.75 / 20.00 | 3.28 / 39.65 | 3.09 / 41.97 | 17.13 / 74.77 | 20.96 / 78.50 | 10.87 / 68.21 | 30.17 / 88.47 | 36.54 / 89.38
+Dravidian (4) | X⇒Eng | 8.04 / 61.95 | 0.89 / 44.01 | 1.18 / 24.29 | 2.65 / 53.17 | 1.52 / 52.95 | 20.26 / 82.00 | 33.10 / 86.91 | 10.26 / 63.77 | 39.07 / 88.42 | 43.17 / 89.10
+Dravidian (4) | Eng⇒X | 5.30 / 48.15 | 0.02 / 32.51 | 0.03 / 15.31 | 0.56 / 34.03 | 0.58 / 35.65 | 12.34 / 64.74 | 18.60 / 75.15 | 6.85 / 62.25 | 37.33 / 86.32 | 44.16 / 87.75
+Sino-Tibetan (3) | X⇒Eng | 9.35 / 58.60 | 9.32 / 65.32 | 16.59 / 72.34 | 18.35 / 74.45 | 16.88 / 74.20 | 21.36 / 78.52 | 27.74 / 84.48 | 11.09 / 71.35 | 30.88 / 86.50 | 35.68 / 87.66
+Sino-Tibetan (3) | Eng⇒X | 10.14 / 74.16 | 2.57 / 54.73 | 10.74 / 66.74 | 12.24 / 65.99 | 9.06 / 65.07 | 19.92 / 76.04 | 22.81 / 81.11 | 10.42 / 73.82 | 16.85 / 80.74 | 32.40 / 88.52
+Other (14) | X⇒Eng | 9.71 / 60.43 | 10.10 / 60.78 | 5.37 / 47.38 | 16.00 / 71.15 | 14.25 / 70.35 | 25.59 / 82.48 | 32.62 / 86.21 | 25.53 / 81.53 | 35.06 / 86.86 | 36.95 / 87.93
+Other (14) | Eng⇒X | 8.42 / 51.57 | 3.82 / 46.85 | 1.73 / 29.73 | 8.19 / 53.20 | 7.14 / 52.12 | 20.26 / 74.31 | 24.04 / 79.59 | 23.29 / 77.80 | 28.54 / 85.84 | 34.34 / 87.82
+Table 1: Average translation performance of LLMs on different language families. The number in the bracket indicates the number of evaluated languages in the specific language family. Bold text denotes the highest BLEU or COMET score across models. Underlined text denotes the highest BLEU or COMET score across LLMs.
+# 3 Experiment Setup
+Dataset We benchmark multilingual translation on FLORES-101 (Goyal et al., 2022) dataset2, which enables an assessment of model quality on a wide range of languages.
+Supervised baselines We report the performance of the supervised models M2M-100-12B (Fan et al., 2021) and NLLB-1.3B (Costa-jussà et al., 2022) (distilled version), which are widely-used many-to-many MMT models. We also report the performance of the powerful commercial translation system, Google Translator4.
+LLMs We evaluate the translation performance of eight popular LLMs: XGLM-7.5B (Lin et al., 2022), OPT-175B (Zhang et al., 2022), BLOOMZ-7.1B (Scao et al., 2022), Falcon-7B (Almazrouei et al., 2023), LLaMA2-7B (Touvron et al., 2023), LLaMA2-7B-chat (Touvron et al., 2023), ChatGPT (OpenAI, 2022) and GPT-4 (OpenAI, 2023).
+Metric Following Goyal et al. (2022), we use SentencePiece BLEU5 (spBLEU) as the evaluation metric, which enables an evaluation of all languages. In addition, we also consider emerging metrics, COMET6 (Rei et al., 2020) and SEScore7 (Xu et al., 2022b), which have been shown to correlate well with human judgements.
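+For reference, the snippet below sketches how spBLEU and COMET could be computed with the sacrebleu and unbabel-comet Python packages. It is an illustration rather than the authors' evaluation scripts, and the name of the SentencePiece tokenizer flag varies across sacrebleu versions.
+
+import sacrebleu
+from comet import download_model, load_from_checkpoint
+
+def spbleu(hypotheses, references):
+    # "flores101" selects the SentencePiece (spBLEU) tokenizer in recent
+    # sacrebleu releases; older releases expose it as "spm".
+    return sacrebleu.corpus_bleu(hypotheses, [references], tokenize="flores101").score
+
+def comet_score(sources, hypotheses, references):
+    # wmt22-comet-da is the checkpoint mentioned in footnote 6.
+    model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
+    data = [{"src": s, "mt": h, "ref": r}
+            for s, h, r in zip(sources, hypotheses, references)]
+    return model.predict(data, batch_size=8, gpus=0).system_score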
+ICL strategy For each model, we report its translation performance with eight randomly-picked translation pairs from the corresponding development set as in-context exemplars and "<X>=<Y>" as the in-context template. "<X>" and "<Y>" are placeholders for the source and target sentence. We use a line-break as the concatenation symbol. According to our experimental analysis, this ICL strategy serves as a simple but strong recipe. All implementation is based on OpenICL3 (Wu et al., 2023).
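+A minimal sketch of this recipe (random exemplar sampling, the "<X>=<Y>" template, and line-break concatenation), reusing the generate_translation helper sketched in Section 2.2; it is an illustration under these assumptions, not the OpenICL-based implementation:
+
+import random
+
+def build_icl_prompt(dev_src, dev_tgt, test_src, k=8, seed=0):
+    # k randomly-picked development-set pairs, wrapped with "<X>=<Y>" and
+    # joined with line-breaks; the test source is appended with an empty target.
+    rng = random.Random(seed)
+    indices = rng.sample(range(len(dev_src)), k)
+    lines = [f"{dev_src[i]}={dev_tgt[i]}" for i in indices]
+    lines.append(f"{test_src}=")
+    return "\n".join(lines)
+
+# Example usage (toy data):
+# prompt = build_icl_prompt(dev_de, dev_en, "Guten Morgen!")
+# hypothesis = generate_translation(prompt)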
+# 4 Benchmarking LLMs for Massively Multilingual Machine Translation
+In this section, we report results on multilingual machine translation and introduce our main findings about LLMs' translation ability.
+The multilingual translation capabilities of LLMs are continually improving. Table 1 presents evaluation results8 grouped by language family.
+2We evaluate LLMs on the first 100 sentences of each direction's test set in the benchmarking experiments, considering the prohibitive API cost of evaluating massive languages. In the analysis experiments, we use the full test set.
+3https://github.com/Shark-NLP/OpenICL
+4https://translate.google.com/
+5https://github.com/mjpost/sacrebleu
+6We compute the score with the wmt22-comet-da model.
+7We compute the score with SEScore-2 (Xu et al., 2022a).
+8Evaluating with SEScore leads to similar findings, thus we report those results in Appendix A. Detailed results for each translation direction are listed in Appendix B.
+Figure 2: Translation performance (BLEU) of GPT-4, ChatGPT, NLLB and Google Translator on our evaluated languages. “X->Eng” and “Eng->X” denote translating to English and translating from English respectively. In each subfigure, languages are sorted according to BLEU scores of GPT-4.
+Monolingual pre-trained LLMs present impressive multilingual translation ability, indicating the possibility of aligning multiple languages even with unsupervised data (Garcia et al., 2023). More encouragingly, the multilingual translation capabilities of LLMs are continually improving. The most recent LLMs are reaching new performance heights; for example, LLaMA2-7B outperforms previously released open-source LLMs, and GPT-4 surpasses ChatGPT. Overall, GPT-4 is the best translator among the evaluated LLMs and achieves the highest average BLEU and COMET scores on most directions.
+Language Family | X⇒Eng | X⇒Fra | X⇒Zho | Eng⇒X | Fra⇒X | Zho⇒X
+Indo-Euro-Germanic (8) | 48.51 | 44.23 | 27.97 | 40.64 | 32.34 | 24.13
+Indo-Euro-Romance (8) | 47.29 | 45.16 | 27.31 | 44.47 | 36.05 | 27.12
+Indo-Euro-Slavic (12) | 41.19 | 40.32 | 25.67 | 36.06 | 30.88 | 23.33
+Indo-Euro-Indo-Aryan (10) | 37.30 | 32.81 | 21.81 | 21.35 | 17.26 | 13.55
+Indo-Euro-Other (11) | 37.29 | 35.36 | 22.70 | 28.45 | 22.57 | 17.50
+Austronesian (6) | 46.81 | 39.98 | 24.40 | 34.66 | 25.64 | 19.52
+Atlantic-Congo (14) | 28.27 | 25.02 | 15.72 | 13.70 | 10.42 | 7.60
+Afro-Asiatic (6) | 30.48 | 27.00 | 17.81 | 19.36 | 14.43 | 10.53
+Turkic (5) | 31.73 | 30.90 | 19.96 | 20.96 | 17.80 | 14.02
+Dravidian (4) | 33.10 | 30.61 | 20.63 | 18.60 | 14.47 | 11.37
+Sino-Tibetan (3) | 27.74 | 27.93 | 20.88 | 22.81 | 19.21 | 16.30
+Other (14) | 32.62 | 31.26 | 21.25 | 24.04 | 20.03 | 16.37
+Table 2: Translation performance (BLEU) of GPT-4 on English-centric, French-centric and Chinese-centric translation.
+LLM's capability is unbalanced across languages. In Table 1, we observe a similar trend for all evaluated LLMs: they perform better at translating into English than translating into non-English. LLM's capability on non-English languages is also unbalanced. For languages that are similar to English, e.g., Indo-European-Germanic languages, LLMs achieve impressive results. For languages that are dissimilar to English, e.g., Sino-Tibetan languages, LLMs often produce less satisfactory results.
+Table 2 presents another clue, where we evaluate GPT-4 on French-centric and Chinese-centric translation. Compared to English-centric translation, GPT-4 faces greater challenges when it comes to non-English-centric translation, which again indicates the LLM's unbalanced translation ability across languages.
+LLMs still lag behind the strong supervised baseline, especially on low-resource languages. Figure 2 shows the translation performance of the supervised systems and GPT-4 on each language. In 40.91% of translation directions, GPT-4 has achieved higher BLEU scores than NLLB, indicating the promising future of this new translation paradigm. But on long-tail low-resource languages, GPT-4 still lags behind NLLB, let alone Google Translator.
+Figure 3: Translation performance (BLEU) of XGLM on evaluated languages and the corpus size of each language relative to English pre-training corpus. In each subfigure, languages are sorted according to BLEU scores of XGLM.
+Figure 4: Translation performance of different models on the FLORES-101 test set and our annotated no-leakage evaluation set NEWS2023.
+Data leakage issues should be considered before evaluating LLMs on public datasets. We do not include BLOOMZ's performance on FLORES-101 in our report because BLOOMZ is instruction-tuned with the XP3 dataset (Scao et al., 2022), which includes the FLORES-200 dataset. Thus BLOOMZ may have been exposed to test cases from FLORES-101 during training. If so, the evaluation results cannot precisely reflect its translation ability (Elangovan et al., 2021).
+To illustrate this concern, we take 1000 English sentences from the most recent news spanning August 2023 to October 20239, and ask human experts to translate them into Chinese to construct a bilingual no-leakage evaluation set, named NEWS2023. Figure 4 shows that BLOOMZ's performance significantly deteriorates on this no-leakage set, whereas other models maintain consistent performance across both datasets. This disparity underscores the risk of using FLORES-101 for evaluating BLOOMZ. Through this example, we wish to draw the community's attention to the potential data leakage issue when evaluating large language models.
+9The news articles were collected from BBC News, Fox News, ABC News and Yahoo News.
+# 5 Analyzing Factors That Influence LLM's Translation Performance
+To better understand how LLM acquires translation ability and which factors influence its performance, we conduct an in-depth analysis. For our analysis, we choose XGLM-7.5B as an example10. Note that, when studying a certain factor, we keep the remaining factors unchanged.
+# 5.1 Findings on Pre-training Corpus Size
+LLM can acquire translation ability in a resource-efficient way. As the XGLM authors report the data distribution of their pre-training corpus, we can investigate the relationship between translation performance and corpus size (Figure 3). We find that for low-resource languages, e.g., Catalan (cat) and Swahili (swh), XGLM can generate moderate translations, showing that LLM can build bilingual mappings between non-English and English with a few non-English monolingual resources (less than 1% of English resources). Even on unseen languages, e.g., Occitan (oci) and Asturian (ast), XGLM can translate through ICL. These observations indicate a potential advantage of the novel translation paradigm: LLM can learn to translate in a resource-efficient way.
+10We choose XGLM for three reasons: (1) XGLM has a multilingual focus and covers many languages, so it can be seen as representative of multilingual LLMs. (2) XGLM-7.5B is an open-source medium-sized LLM. It is more affordable to run experiments with it than with large-sized or closed-source LLMs. (3) The composition of XGLM's pre-training corpus is clear, allowing us to analyze the relationship between translation ability and corpus size.
+In-context Template | Deu-Eng | Eng-Deu | Rus-Eng | Eng-Rus | Rus-Deu | Deu-Rus | Average ("\n" denotes a line break inside the template)
+reasonable instructions:
+<X>=<Y> | 37.37 | 26.49 | 29.66 | 22.25 | 17.66 | 17.31 | 25.12
+<X> \n Translate from [SRC] to [TGT]: \n <Y> | 37.95 | 26.29 | 29.83 | 20.61 | 17.56 | 15.93 | 24.70
+<X> \n Translate to [TGT]: \n <Y> | 37.69 | 25.84 | 29.96 | 19.61 | 17.44 | 16.48 | 24.50
+<X> \n [TGT]: <Y> | 29.94 | 17.99 | 25.22 | 16.29 | 12.28 | 11.71 | 18.91
+<X> is equivalent to <Y> | 23.00 | 4.21 | 17.76 | 9.44 | 8.14 | 9.84 | 12.07
+<X> can be translated to <Y> | 37.55 | 26.49 | 29.82 | 22.14 | 17.48 | 16.40 | 24.98
+[SRC]: <X> \n [TGT]: <Y> | 16.95 | 8.90 | 14.48 | 6.88 | 7.86 | 4.01 | 9.85
+unreasonable instructions:
+<X>$<Y> | 37.77 | 26.43 | 29.53 | 20.99 | 17.72 | 17.27 | 24.95
+<X> \n Translate from [TGT] to [SRC]: \n <Y> | 38.18 | 26.21 | 29.85 | 20.35 | 17.75 | 16.63 | 24.83
+<X> \n Compile to [TGT]: \n <Y> | 37.39 | 26.35 | 29.68 | 19.91 | 17.52 | 16.15 | 24.50
+<X> \n [SRC]: <Y> | 27.86 | 16.69 | 24.41 | 18.16 | 11.98 | 12.60 | 18.62
+<X> is not equivalent to <Y> | 23.50 | 3.92 | 16.90 | 7.80 | 8.06 | 9.23 | 11.57
+<X> can be summarized as <Y> | 37.46 | 26.24 | 29.42 | 22.62 | 17.68 | 17.15 | 25.10
+[SRC]: <X> \n [SRC]: <Y> | 19.03 | 8.21 | 15.96 | 6.37 | 7.57 | 4.40 | 10.26
+Table 3: Translation performance (BLEU) of using different templates for in-context learning. The number of in-context exemplars is fixed at eight in this experiment. “<X>” and “<Y>” denote the placeholder for source and target sentence respectively. “[SRC]” and “[TGT]” represent the placeholder for source and target language name in English. Bold text denotes the highest score along the column.
+# 5.2 Findings on In-context Template
+The good performance of LLMs relies on a carefully-designed template. The initial step of applying in-context learning for translation is determining the template. We find that the translation performance varies greatly with different templates (Table 3), where the largest gap in the average performance is up to 16 BLEU. The best template for each direction is also different. Among these templates, "<X>=<Y>" achieves the highest average BLEU score. "[SRC]: <X> \n [TGT]: <Y>" achieves the lowest score, although it is a commonly-used template for prompting other LLMs, e.g., PaLM (Vilar et al., 2022), GLM (Zhang et al., 2023). Such phenomena indicate that the template plays a vital role in ICL and it may be challenging to design a universally optimal template for different LLMs and translation directions.
+Even unreasonable templates can instruct LLM to generate decent translations. A common intuition of ICL is that the template instructs LLMs to do the target task (Brown et al., 2020), e.g., the template "<X> can be translated to <Y>" instructs the LLM to perform the translation task.
+Figure 5: Effects of using cross-lingual exemplars.
+However, we find that wrapping translation exemplars with a task-unrelated template can also serve as an effective prompt. For example, a template like "<X> can be summarized as <Y>" can also instruct the LLM to generate a translation, rather than guiding it to generate a summary. Given the fact that these unreasonable templates are also effective, the community may not fully understand the role of the in-context template.
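+To make the comparison concrete, the toy sketch below instantiates a reasonable and an unreasonable template from Table 3 for a single exemplar. The placeholder convention follows the table caption, and the German/English filler sentences are invented purely for illustration.
+
+def fill_template(template, src, tgt, src_lang="German", tgt_lang="English"):
+    # Replace the placeholders used in Table 3: <X>/<Y> for the sentence pair,
+    # [SRC]/[TGT] for the language names.
+    return (template.replace("[SRC]", src_lang).replace("[TGT]", tgt_lang)
+                    .replace("<X>", src).replace("<Y>", tgt))
+
+reasonable = fill_template("<X>=<Y>", "Guten Morgen!", "Good morning!")
+unreasonable = fill_template("<X> can be summarized as <Y>", "Guten Morgen!", "Good morning!")
+# Both strings, repeated for k exemplars and concatenated, elicit translation from the LLM.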
+# 5.3 Findings on In-context Exemplar
+Cross-lingual exemplars help for certain translation directions. The translation direction of the exemplar is a unique factor in machine translation. We find that using cross-lingual exemplars does not always cause worse performance and show two cases in Figure 5. When using cross-lingual exemplars for German-English translation, the translation performance degenerates. But when using cross-lingual exemplars for low-resource Chinese-English translation (illustrated in Appendix D), XGLM's translation performance usually improves significantly, even when both the source and target languages are changed. This phenomenon indicates the potential usage of cross-lingual exemplars in a broader range of tasks (Lin et al., 2022), and we will explore more about this in the future.
+In-context Exemplars | Consistency | Granularity | Diversity | Deu-Eng | Eng-Deu
+Mismatched Translation | ✗ | ✓ | ✓ | 0.00 | 0.00
+Word-level Translation | ✓ | ✗ | ✓ | 25.10 | 5.84
+Doc-level Translation | ✓ | ✗ | ✓ | 8.01 | 2.05
+Duplicated Translation | ✓ | ✓ | ✗ | 35.12 | 19.66
+Sent-level Translation | ✓ | ✓ | ✓ | 37.37 | 26.49
+Table 4: Translation performance of XGLM when using different contents as in-context exemplars. “Consistency” column denotes whether source and target sentence are semantically consistent. “Granularity” column denotes whether the exemplar is a sentence-level pair. “Diversity” column denotes whether exemplars in the context are different from each other.
+[Figure 6 panels: Deu-Eng, Eng-Deu, Rus-Eng, Eng-Rus, Rus-Deu, Deu-Rus; x-axis: Exemplar Number; y-axis: BLEU; legend: Random, BM25, TopK, Oracle]
+Figure 6: Effects of selecting varying number of in- context exemplars according to different strategies.
+Rev ratio | Deu-Eng Head | Deu-Eng Tail | Eng-Deu Head | Eng-Deu Tail
+0 / 8 | 37.37 | 37.37 | 26.49 | 26.49
+1 / 8 | 37.74 | 36.05 | 26.75 | 23.96
+2 / 8 | 37.29 | 36.79 | 26.89 | 24.66
+3 / 8 | 36.82 | 35.67 | 26.44 | 24.34
+4 / 8 | 36.60 | 35.18 | 26.23 | 22.17
+5 / 8 | 35.61 | 31.93 | 25.58 | 17.47
+6 / 8 | 30.49 | 20.71 | 22.42 | 8.73
+7 / 8 | 14.60 | 5.36 | 12.51 | 3.19
+8 / 8 | 3.42 | 3.42 | 3.10 | 3.10
+Table 5: Effects of reversing in-context exemplars' translation direction. "Rev ratio" means the number of exemplars that are reversed. "Head" and "Tail" represent reversing the exemplars in the head and tail of the prompt respectively.
+Semantically-related exemplars do not bring more benefits than randomly-picked exemplars. In this paper, we use the development set for exemplar selection, which has been found to be a high-quality candidate pool (Vilar et al., 2022), and we compare four ways of selecting in-context exemplars, namely Random11, BM2512, TopK13 and Oracle14. Effects of selecting varying numbers of in-context exemplars with different approaches are shown in Figure 6. The general trend in all datasets is similar. As the number of exemplars grows from 1 to 8, the BLEU score increases rapidly. Afterwards, the translation performance plateaus regardless of the selection strategy. When more exemplars are added, e.g., 32 exemplars, the BLEU score usually starts to decline, which is the opposite of the observation in natural language understanding tasks (Li et al., 2023).
+Compared to semantically-related exemplars, randomly-picked exemplars give comparable translation performance. Even the performance of oracle selection is on par with random selection. Based on these observations, we suggest that translation exemplars can teach LLM to translate, but LLM may struggle to acquire helpful translation knowledge from semantically-related exemplars.
+11Random: picking exemplars on a random basis.
+12BM25: selecting exemplars whose source sentences are similar to the test case's source sentence according to BM25.
+13TopK: selecting exemplars whose source sentences are similar to the test case's source sentence according to the similarity of sentence embeddings.
+14Oracle: selecting exemplars whose target sentences are similar to the test case’s according to sentence embedding, which can be seen as the upper bound of selection strategy.
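+For concreteness, the four selection strategies could be sketched as follows; rank_bm25 and sentence-transformers (with an arbitrary embedding model) are stand-in libraries here, since the paper does not tie the strategies to a specific retrieval setup.
+
+import random
+from rank_bm25 import BM25Okapi
+from sentence_transformers import SentenceTransformer, util
+
+encoder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model
+
+def select_random(pool, k=8, seed=0):
+    # pool is a list of (source, target) development-set pairs.
+    return random.Random(seed).sample(pool, k)
+
+def select_bm25(pool, test_src, k=8):
+    bm25 = BM25Okapi([src.split() for src, _ in pool])
+    scores = bm25.get_scores(test_src.split())
+    best = sorted(range(len(pool)), key=lambda i: scores[i], reverse=True)[:k]
+    return [pool[i] for i in best]
+
+def select_topk(pool, test_src, k=8):
+    # TopK: exemplars whose *source* sentences are closest to the test source.
+    pool_emb = encoder.encode([src for src, _ in pool], convert_to_tensor=True)
+    sims = util.cos_sim(encoder.encode(test_src, convert_to_tensor=True), pool_emb)[0]
+    return [pool[i] for i in sims.topk(k).indices.tolist()]
+
+def select_oracle(pool, test_ref, k=8):
+    # Oracle: exemplars whose *target* sentences are closest to the reference
+    # (an upper bound that is unavailable at test time).
+    pool_emb = encoder.encode([tgt for _, tgt in pool], convert_to_tensor=True)
+    sims = util.cos_sim(encoder.encode(test_ref, convert_to_tensor=True), pool_emb)[0]
+    return [pool[i] for i in sims.topk(k).indices.tolist()]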
+Exemplars teach LLM the core features of the translation task. To better understand how ICL exemplars influence LLM's understanding of the translation task, we observe LLM's translation behaviour under abnormal in-context exemplars (Table 4).
+We can see that LLM completely fails when mismatched translations are used as exemplars, indicating that LLM needs to learn from the context to keep source and target sentences semantically consistent. Word-level15 and document-level16 translation exemplars degrade LLM's translation performance, which demonstrates that the translation granularity of exemplars matters as well. Another interesting phenomenon is that LLM performs worse when duplicated translations are used as exemplars, indicating that keeping in-context exemplars diverse is also important. In general, these comparison results show that LLM learns the core features of the translation task through in-context learning.
+Exemplars in the tail of the prompt have more impact on the LLM's behaviour. During our analysis, we find that reversing the translation direction of exemplars will cause LLM to fail. Based on this observation, we conduct experiments to investigate the importance of different parts of the prompt (Table 5). We find that reversing exemplars in the tail of the prompt consistently produces worse results compared to reversing exemplars in the head, which suggests that exemplars in the tail of the prompt have a larger influence on LLM's behaviour.
+# 6 Related Work
+In-context learning for machine translation Using LLMs for multilingual machine translation is attracting more and more attention. Lin et al. (2022) evaluate GPT-3 and XGLM-7.5B on 182 directions. Bawden and Yvon (2023) evaluate BLOOM on 30 directions. Bang et al. (2023), Jiao et al. (2023) and Hendy et al. (2023) evaluate ChatGPT on 6 to 18 directions. In this paper, we thoroughly evaluate the multilingual translation performance of popular LLMs on 102 languages and 606 directions and compare them with state-of-the-art translation engines, such as NLLB and Google Translate, which provides a more comprehensive benchmark result
+15We select word pairs from an open-source fasttext dictionary.
+16We select document translations from the Europarl dataset.
+and highlights the challenges involved in optimizing this emerging translation paradigm.
+To find better ICL recipes for machine translation, many efforts have been put into designing exemplar selection strategies (Agrawal et al., 2022; Zhang et al., 2023; Moslem et al., 2023). Similar to the findings of Zhang et al. (2023), we find that random selection is a simple but effective strategy. We also find that even oracle selection cannot result in consistently better performance. Wei et al. (2022a) show that few-shot exemplars improve translation performance. But we further demonstrate the dynamic variation of translation performance with the number of in-context exemplars and the usage of cross-lingual exemplars. Besides, Vilar et al. (2022) find that using a high-quality pool, e.g., the development set, for ICL exemplar selection is better, and Zhang et al. (2023) analyze why the quality of translation exemplars matters. In this paper, we reveal how in-context exemplars teach LLM to translate by analyzing LLM's behaviour under different kinds of exemplars.
+Multilingual machine translation Developing a bilingual translation system for each direction becomes impossible when the number of supported languages increases. Therefore, multilingual machine translation was proposed (Johnson et al., 2017). But how to build a high-quality yet efficient MMT system remains an ongoing challenge (Costa-jussà et al., 2022; Yuan et al., 2023; Guerreiro et al., 2023). In this paper, we focus on LLMs and reveal their potential in MMT.
+# 7 Conclusion
+In this paper, we evaluate the multilingual translation ability of popular LLMs, including ChatGPT and GPT-4, on 102 languages and 606 directions, which presents the advantages and challenges of LLMs for MMT. We find that the translation capabilities of LLMs are continually improving and GPT-4 reaches new performance heights. But even GPT-4 still faces challenges on low-resource languages. In our analysis, we find that LLMs exhibit new working patterns when used for MMT. For example, instruction semantics can be ignored during in-context learning, and cross-lingual exemplars can provide better task instruction for low-resource translation. More importantly, we find that LLM can acquire translation ability in a resource-efficient way, which indicates the promising future of LLMs in multilingual machine translation.
+# Acknowledgement
+We would like to thank Fei Yuan and Zhenyu Wu for their support of this project. Shujian Huang is the corresponding author. This work is partially supported by the National Science Foundation of China (No. 62376116, 62176120) and the Liaoning Provincial Research Foundation for Basic Research (No. 2022-KF-26-02).
+# References
+Sweta Agrawal, Chunting Zhou, Mike Lewis, Luke Zettlemoyer, and Marjan Ghazvininejad. 2022. In-context examples selection for machine translation. arXiv preprint arXiv:2212.02437.
+Ebtesam Almazrouei, Hamza Alobeidli, Abdulaziz Alshamsi, Alessandro Cappelli, Ruxandra Cojocaru, Merouane Debbah, Etienne Goffinet, Daniel Heslow, Julien Launay, Quentin Malartic, et al. 2023. Falcon-40b: an open large language model with state-of-the-art performance. URL https://huggingface.co/tiiuae/falcon-40b.
+Yejin Bang, Samuel Cahyawijaya, Nayeon Lee, Wenliang Dai, Dan Su, Bryan Wilie, Holy Lovenia, Ziwei Ji, Tiezheng Yu, Willy Chung, et al. 2023. A multitask, multilingual, multimodal evaluation of chatgpt on reasoning, hallucination, and interactivity. arXiv preprint arXiv:2302.04023.
+Rachel Bawden and François Yvon. 2023. Investigating the translation performance of a large multilingual language model: the case of bloom. arXiv preprint arXiv:2303.01911.
+Yoshua Bengio, Réjean Ducharme, and Pascal Vincent. 2000. A neural probabilistic language model. Advances in Neural Information Processing Systems (NeurIPS).
+Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. 2020. Language models are few-shot learners. Advances in Neural Information Processing Systems (NeurIPS).
+Marta R Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, et al. 2022. No language left behind: Scaling human-centered machine translation. arXiv preprint arXiv:2207.04672.
+Qingxiu Dong, Lei Li, Damai Dai, Ce Zheng, Zhiyong Wu, Baobao Chang, Xu Sun, Jingjing Xu, Lei Li, and Zhifang Sui. 2022. A survey for in-context learning. arXiv preprint arXiv:2301.00234.
+Aparna Elangovan, Jiayuan He, and Karin Verspoor. 2021. Memorization vs. generalization: Quantifying data leakage in NLP performance evaluation. In Proceedings of the Conference of the European Chapter of the Association for Computational Linguistics (EACL).
+Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, et al. 2021. Beyond english-centric multilingual machine translation. The Journal of Machine Learning Research (JMLR).
+Xavier Garcia, Yamini Bansal, Colin Cherry, George Foster, Maxim Krikun, Fangxiaoyu Feng, Melvin Johnson, and Orhan Firat. 2023. The unreasonable effectiveness of few-shot learning for machine translation. arXiv preprint arXiv:2302.01398.
+Naman Goyal, Cynthia Gao, Vishrav Chaudhary, Peng-Jen Chen, Guillaume Wenzek, Da Ju, Sanjana Krishnan, Marc'Aurelio Ranzato, Francisco Guzmán, and Angela Fan. 2022. The Flores-101 evaluation benchmark for low-resource and multilingual machine translation. Transactions of the Association for Computational Linguistics (TACL).
+Nuno M Guerreiro, Duarte Alves, Jonas Waldendorf, Barry Haddow, Alexandra Birch, Pierre Colombo, and André FT Martins. 2023. Hallucinations in large multilingual translation models. arXiv preprint arXiv:2303.16104.
+Amr Hendy, Mohamed Abdelrehim, Amr Sharaf, Vikas Raunak, Mohamed Gabr, Hitokazu Matsushita, Young Jin Kim, Mohamed Afify, and Hany Hassan Awadalla. 2023. How good are gpt models at machine translation? a comprehensive evaluation. arXiv preprint arXiv:2302.09210.
+Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. 2022. An empirical analysis of compute-optimal large language model training. Advances in Neural Information Processing Systems (NeurIPS).
+Wenxiang Jiao, Wenxuan Wang, JT Huang, Xing Wang, and ZP Tu. 2023. Is chatgpt a good translator? yes with gpt-4 as the engine. arXiv preprint arXiv:2301.08745.
+Melvin Johnson, Mike Schuster, Quoc V. Le, Maxim Krikun, Yonghui Wu, Zhifeng Chen, Nikhil Thorat, Fernanda Viégas, Martin Wattenberg, Greg Corrado, Macduff Hughes, and Jeffrey Dean. 2017. Google's multilingual neural machine translation system: Enabling zero-shot translation. Transactions of the Association for Computational Linguistics (TACL).
+Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361.
+Urvashi Khandelwal, Omer Levy, Dan Jurafsky, Luke Zettlemoyer, and Mike Lewis. 2020. Generalization through memorization: Nearest neighbor language models. In International Conference on Learning Representations (ICLR).
+Mukai Li, Shansan Gong, Jiangtao Feng, Yiheng Xu, Jun Zhang, Zhiyong Wu, and Lingpeng Kong. 2023. In-context learning with many demonstration examples. arXiv preprint arXiv:2302.04931.
+Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, and Xian Li. 2022. Few-shot learning with multilingual generative language models. In Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP).
+Tomáš Mikolov, Martin Karafiát, Lukáš Burget, Jan Černocký, and Sanjeev Khudanpur. 2010. Recurrent neural network based language model. Interspeech.
+Yasmin Moslem, Rejwanul Haque, and Andy Way. 2023. Adaptive machine translation with large language models. arXiv preprint arXiv:2301.13294.
+OpenAI. 2022. ChatGPT. https://openai.com/blog/chatgpt.
+OpenAI. 2023. Gpt-4 technical report.
+Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. 2019. Language models are unsupervised multitask learners.
+Ricardo Rei, Craig Stewart, Ana C Farinha, and Alon Lavie. 2020. COMET: A neural framework for MT evaluation. In Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP).
+Xiaozhe Ren, Pingyi Zhou, Xinfan Meng, Xinjing Huang, Yadao Wang, Weichao Wang, Pengfei Li, Xiaoda Zhang, Alexander Podolskiy, Grigory Arshinov, Andrey Bout, Irina Piontkovskaya, Jiansheng Wei, Xin Jiang, Teng Su, Qun Liu, and Jun Yao. 2023. Pangu-sigma: Towards trillion parameter language model with sparse heterogeneous computing. arXiv preprint arXiv:2303.10845.
+Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ilić, Daniel Hesslow, Roman Castagné, Alexandra Sasha Luccioni, François Yvon, Matthias Gallé, et al. 2022. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100.
+Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288.
+Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Processing Systems (NeurIPS).
+David Vilar, Markus Freitag, Colin Cherry, Jiaming Luo, Viresh Ratnakar, and George Foster. 2022. Prompting palm for translation: Assessing strategies and performance. arXiv preprint arXiv:2211.09102.
+Jason Wei, Maarten Bosma, Vincent Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. 2022a. Finetuned language models are zero-shot learners. In International Conference on Learning Representations (ICLR).
+Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, et al. 2022b. Emergent abilities of large language models. arXiv preprint arXiv:2206.07682.
+Jerry W. Wei, Jason Wei, Yi Tay, Dustin Tran, Al- bert Webson, Yifeng Lu, Xinyun Chen, Hanxiao Liu, Da Huang, Denny Zhou, and Tengyu Ma. 2023. Larger language models do in-context learning dif- ferently. CoRR, abs/2303.03846.
+Zhenyu Wu, YaoXiang Wang, Jiacheng Ye, Jiangtao Feng, Jingjing Xu, Yu Qiao, and Zhiyong Wu. 2023. Openicl: An open-source framework for in-context learning. arXiv preprint arXiv:2303.02913.
+Wenda Xu, Xian Qian, Mingxuan Wang, Lei Li, and William Yang Wang. 2022a. Sescore2: Retrieval augmented pretraining for text generation evaluation. arXiv preprint arXiv:2212.09305.
+Wenda Xu, Yi-Lin Tuan, Yujie Lu, Michael Saxon, Lei Li, and William Yang Wang. 2022b. Not all errors are equal: Learning text generation metrics using stratified error synthesis. In Findings of the Association for Computational Linguistics: EMNLP 2022.
+Fei Yuan, Yinquan Lu, Wenhao Zhu, Lingpeng Kong, Lei Li, Yu Qiao, and Jingjing Xu. 2023. Lego-mt: Towards detachable models in massively multilingual machine translation. In Findings of the Association for Computational Linguistics: ACL 2023.
+Biao Zhang, Barry Haddow, and Alexandra Birch. 2023. Prompting large language model for machine transla- tion: A case study. arXiv preprint arXiv:2301.07069.
+Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher De- wan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068.
+# A Evaluating LLM’s translation performance with SEScore
+Table 6 presents the average SEScore of LLMs on different language families. Currently, SEScore mainly supports evaluating English translations. Thus we evaluate LLM's performance on translating other languages to English.
+# B Detailed Results on Each Language
+We report detailed results of our evaluated models in Table 7 (BLEU), Table 8 (COMET), Table 9 (SEScore) and Figure 8. One thing that needs to be mentioned is that BLEU supports all translation directions, whereas COMET and SEScore only support a subset of these translation directions.
+# C Lists of Language
+We evaluate 102 languages in this paper. Table 10 lists the name, ISO code and language family of these languages.
+# D Cross-lingual Exemplars
+In Figure 7, we show an example of using cross-lingual in-context exemplars (Russian-English exemplars for Chinese-English translation).
+[Input]
+ror dbunbM c ysacruem Paiiana Focamura u Duwi Croyn TOY HOMMHALLHH BO BCeX T1aBHBIX KaTeropHax.=The movie, featuring Ryan Gosling and Emma Stone, received nominations in all major categories.
+"Tenepp y Hac ecTs ueTEIpéXMecHIHbIe MBILLIH, y KOTOPEIX Sonbute HeT WHaGeTa", — ,oGaBH OH.="We now have 4- month-old mice that are non-diabetic that used to be diabetic," he added.
+Tocnmar 1 Croyn nony4ian HomuHaqun Ha syamero akrepa 1 akpicy coorsercrsenHo.=Gosling and Stone received nominations for Best Actor and Actress respectively.
+Haxoska TaKxKe NO3BONAeT O3HAKOMHTECA C 9BOMOUHell NepbeB y nrnu.=The find also grants insight into the evolution of feathers in birds.
+Kanuenapua ryGepuaropa coobmuua, 170 19 13 panenHprx 6s11H ospHuepamn noaMMHH.=The governor's office said nineteen of the injured were police officers.
+Crangapr 802.1 1n paGoraer Ha oGoux sactorax — 2.4 [Tun 5.0 'Tu.=The 802.1 1n standard operates on both the 2.4Ghz and 5.0Ghz frequencies.
+On ckasaz, 470 cozgaa ABepHoii 3B0HOK, paGoraiomunti oF WiFi.=He built a WiFi door bell, he said.
+B konue 2017 roxa Cummaodip nospuiica Ha ToproBom TeneKanane QVC.=In late 2017, Siminoff appeared on shopping television channel QVC.
+PBL GED IE AAT HA AE HAIN Ta] (GMT) 4K 12 pipers TF Wt. =
+[Output]
+The Iraqi research team submitted a report at Greenwich time (GMT) today at 12 noon.
+Figure 7: An example of using cross-lingual in-context exemplars
+Language Family (Direction: X⇒Eng, SEScore) | XGLM-7.5B | OPT-175B | Falcon-7B | LLaMA-7B | LLaMA-7B-Chat | ChatGPT | GPT4 | M2M-12B | NLLB-1.3B | Google
+Indo-Euro-Germanic (8) | -11.78 | -6.00 | -8.34 | -5.41 | -5.90 | -2.52 | -2.16 | -3.15 | -2.78 | -1.85
+Indo-Euro-Romance (8) | -6.54 | -4.01 | -5.57 | -3.72 | -4.14 | -2.30 | -2.08 | -3.08 | -2.54 | -2.12
+Indo-Euro-Slavic (12) | -14.29 | -10.31 | -13.46 | -5.11 | -5.75 | -3.55 | -3.17 | -4.21 | -3.70 | -2.80
+Indo-Euro-Indo-Aryan (10) | -16.45 | -22.15 | -21.65 | -17.15 | -19.46 | -7.64 | -4.69 | -11.77 | -3.53 | -2.80
+Indo-Euro-Other (11) | -18.36 | -17.81 | -18.09 | -13.61 | -15.42 | -6.74 | -4.62 | -7.57 | -3.75 | -4.40
+Austronesian (6) | -14.06 | -10.08 | -12.30 | -9.61 | -10.48 | -4.48 | -3.03 | -5.37 | -3.47 | -2.56
+Atlantic-Congo (14) | -19.42 | -17.61 | -18.44 | -17.59 | -18.48 | -12.38 | -9.34 | -14.16 | -6.88 | -5.75
+Afro-Asiatic (6) | -18.85 | -18.91 | -19.17 | -16.61 | -17.66 | -12.16 | -8.28 | -14.41 | -4.46 | -3.49
+Turkic (5) | -17.15 | -16.99 | -18.66 | -15.50 | -16.47 | -7.63 | -5.50 | -15.29 | -4.89 | -3.93
+Dravidian (4) | -16.52 | -22.58 | -21.91 | -20.18 | -21.96 | -9.26 | -5.35 | -13.69 | -3.76 | -3.07
+Sino-Tibetan (3) | -19.41 | -15.20 | -12.37 | -11.33 | -12.01 | -10.43 | -6.79 | -11.93 | -5.50 | -4.30
+Other (14) | -16.74 | -16.56 | -18.70 | -13.05 | -14.17 | -8.51 | -6.07 | -6.91 | -4.94 | -3.80
+Table 6: Average SEScore of LLMs on different language families. The number in the bracket indicates the number of evaluated languages in the specific language family. Bold text denotes the highest SEScore across models. Underlined text denotes the highest SEScore across LLMs.
+Language Family Language XGLM-7.5B OPT-175B X⇒Eng (BLEU) Falcon-7B LLaMA2-7B LLaMA2-7B-Chat ChatGPT GPT4 M2M-12B NLLB-1.3B Google XGLM-7.5B OPT-175B Eng⇒X (BLEU) Falcon-7B LLaMA2-7B LLaMA2-7B-Chat Indo-European-Germanic (8) afr dan nld deu isl ltz nob swe 16.34 20.65 17.78 34.03 5.65 14.13 17.19 22.54 48.49 43.54 31.25 39.15 12.68 17.96 39.45 44.67 34.73 35.31 26.87 34.60 8.18 13.60 28.38 37.30 47.89 48.33 34.46 41.94 15.41 21.87 41.91 46.47 42.89 45.83 33.03 39.44 12.28 18.36 42.08 44.62 59.28 51.23 38.10 43.56 32.98 44.57 46.62 50.32 62.65 53.18 38.60 47.04 37.58 49.20 48.51 51.34 52.86 48.32 34.52 42.79 29.47 40.04 45.38 48.37 57.76 52.35 38.68 44.79 35.07 50.37 43.76 49.50 63.15 56.44 39.66 48.52 43.19 52.52 49.94 55.86 5.56 7.91 7.64 25.44 1.40 4.74 8.55 12.04 20.75 26.81 21.38 23.38 3.10 5.54 23.18 27.00 14.45 14.80 16.69 20.65 2.77 5.10 12.90 18.12 22.98 32.79 24.89 30.46 5.13 6.32 26.01 33.69 20.42 28.19 20.80 26.01 5.53 5.72 20.35 28.49 42.18 45.49 32.57 41.02 21.26 24.65 35.44 48.09 48.02 47.46 34.66 44.69 27.89 33.89 39.10 49.39 41.41 45.12 31.79 40.18 27.80 28.04 37.09 47.02 43.39 43.81 32.93 40.20 31.04 35.08 36.33 45.00 Average 18.54 34.65 27.37 37.28 34.82 45.83 48.51 42.72 46.54 51.16 9.16 18.89 13.19 22.78 19.44 36.34 40.64 37.30 38.47 Indo-European-Romance (8) ast cat fra glg oci por ron spa 27.65 38.33 36.81 29.93 35.27 41.67 11.27 27.98 32.20 41.45 43.02 36.57 41.41 44.64 41.33 30.81 28.84 27.52 41.62 29.30 36.11 44.49 34.49 30.13 33.88 44.48 44.11 37.98 42.89 48.14 44.24 33.09 30.90 40.97 41.15 35.43 37.45 45.47 40.83 30.51 43.18 47.04 46.13 43.33 51.86 53.09 47.31 33.48 46.41 49.10 48.81 42.18 57.73 52.81 47.53 33.76 39.06 44.21 43.99 38.13 48.03 48.76 45.87 30.63 41.65 48.72 46.23 45.12 56.93 51.20 47.85 32.91 -1.00 52.46 50.68 44.18 -1.00 52.68 53.18 34.36 12.70 34.10 36.49 12.60 13.20 36.83 5.85 23.82 13.11 23.49 37.97 18.53 8.90 37.72 31.35 23.35 10.96 13.95 43.87 12.30 7.60 34.62 14.97 21.93 12.89 36.18 42.86 16.07 12.76 42.85 33.08 25.83 11.24 35.31 39.60 14.38 11.62 38.70 28.31 24.84 28.24 46.33 55.71 38.07 30.33 53.95 45.87 32.31 35.45 48.34 56.80 39.54 40.20 55.89 47.62 31.88 33.43 48.49 53.59 38.29 39.40 53.75 47.99 28.93 34.01 48.79 55.73 37.11 44.45 52.29 43.42 32.08 Average 24.83 36.79 30.72 39.19 36.33 45.76 47.90 42.53 46.43 43.43 15.55 21.60 16.61 25.30 22.47 38.84 42.55 40.14 40.98 Indo-European-Slavic (12) bel bos bul hrv ces mkd pol rus srp slk slv ukr 1.98 7.88 34.48 6.66 8.84 21.00 7.46 27.83 11.56 7.15 6.67 16.95 4.48 34.37 11.48 33.37 32.26 8.32 28.63 18.80 6.57 30.21 25.64 15.80 1.88 21.26 8.07 19.48 22.03 5.63 23.95 14.26 4.70 16.86 13.08 6.63 12.85 39.24 38.18 36.35 39.44 33.36 33.02 33.44 36.97 31.50 33.26 40.37 9.48 37.13 34.32 34.68 35.74 27.81 31.44 31.92 33.34 29.03 29.52 36.89 23.71 44.86 41.65 40.02 43.25 41.76 34.31 38.04 40.71 40.92 39.04 42.95 25.12 48.34 44.97 40.42 42.08 44.36 38.12 38.75 44.09 43.13 39.70 45.16 15.62 41.24 40.50 36.28 41.87 39.59 32.65 32.73 37.56 38.57 35.88 37.89 26.00 44.47 41.60 37.62 41.42 44.34 34.27 38.60 41.40 41.28 37.73 41.97 27.03 49.75 48.32 42.60 47.00 49.21 37.74 40.09 46.75 45.71 41.69 47.44 0.31 1.97 31.53 1.44 2.54 5.97 2.02 23.18 1.55 2.54 1.71 2.04 0.35 18.05 2.83 15.71 15.47 1.52 14.15 6.48 0.86 10.24 9.10 3.38 0.39 7.41 3.11 6.19 8.09 2.06 7.96 3.49 1.30 5.80 4.78 1.49 3.39 23.37 26.38 21.96 27.30 12.80 20.79 25.54 24.58 13.66 17.98 25.17 1.89 18.71 20.13 17.66 21.73 8.58 17.93 21.50 19.85 10.30 16.37 19.08 16.95 34.44 40.78 31.90 35.22 34.94 30.16 36.45 30.39 32.48 32.04 35.53 20.13 37.52 
42.02 37.84 39.72 36.69 32.27 37.71 36.18 38.78 36.03 37.87 13.59 33.78 49.44 32.54 37.21 42.38 29.26 39.69 30.00 37.84 36.89 37.54 24.55 37.77 46.38 34.94 38.62 42.31 29.67 37.86 35.35 38.73 34.77 37.80 Average 19.84 29.95 23.19 36.97 34.02 42.97 45.02 39.67 43.34 43.51 11.63 15.85 11.35 23.13 19.76 36.17 39.77 37.94 39.09 Indo-European-Indo-Aryan (10) asm ben guj hin mar npi ory pan snd urd 4.18 19.84 0.21 26.99 5.63 8.47 0.31 0.13 1.70 19.31 1.11 1.12 1.06 1.17 0.87 2.31 0.82 1.09 1.72 0.74 1.17 1.66 1.65 1.26 1.00 3.17 1.14 1.17 0.65 1.09 3.82 6.72 1.49 21.04 7.37 9.88 1.35 2.09 4.27 8.76 1.27 2.71 1.61 14.89 4.78 6.62 1.33 1.46 3.25 4.95 18.58 24.63 22.78 38.15 26.94 28.83 17.83 28.65 17.29 29.53 27.47 34.23 36.44 45.88 37.08 45.25 33.07 42.28 31.53 39.72 -1.00 30.60 0.90 40.72 27.29 19.00 0.64 24.92 8.31 23.94 32.32 36.97 41.76 45.83 39.25 44.01 39.02 44.34 43.32 40.67 35.35 43.37 45.97 53.17 46.02 51.91 42.00 49.86 46.23 42.69 0.42 11.27 0.03 18.81 1.58 1.63 0.01 0.06 0.20 13.63 0.05 0.03 0.02 0.42 0.06 0.12 0.06 0.06 0.39 0.20 0.05 0.11 0.04 0.27 0.07 0.14 0.02 0.01 0.31 0.29 0.21 2.09 0.21 5.84 2.17 2.14 0.05 0.21 0.82 2.37 0.07 0.78 0.11 5.18 1.83 1.65 0.02 0.17 0.60 2.03 9.08 18.65 18.05 32.44 12.22 16.16 10.70 21.38 8.75 17.58 12.74 24.74 20.65 35.30 17.13 22.73 18.12 25.73 14.97 21.43 -1.00 28.39 7.32 40.54 18.27 4.08 0.60 14.85 13.15 18.17 26.02 34.31 38.37 44.97 27.66 30.96 32.57 41.57 34.34 29.65 Average 16.91 22.38 17.45 29.00 26.20 38.33 42.99 33.85 42.66 44.07 9.82 11.71 8.40 17.47 14.89 30.99 34.92 31.76 37.76 Indo-European-Other (11) hye ell gle cym ita lav lit pus fas ckb tgk 0.15 27.54 4.02 4.27 31.17 2.69 2.90 1.56 3.79 0.34 2.06 0.32 9.42 10.49 10.74 32.71 7.00 7.97 1.82 2.01 1.48 1.83 0.74 5.70 8.63 8.46 33.41 4.73 7.60 3.05 2.58 0.84 1.65 3.83 24.18 17.98 18.99 36.30 13.27 12.66 5.03 16.97 2.94 4.84 2.05 17.56 13.61 12.89 35.60 8.75 11.60 4.78 12.42 2.34 4.45 15.30 38.39 37.74 49.92 37.32 33.54 34.34 14.30 35.30 13.39 15.41 32.20 42.36 47.94 60.07 38.85 37.92 37.41 21.46 38.60 24.40 29.01 20.70 35.74 3.24 29.28 34.85 34.06 33.45 24.52 32.29 -1.00 -1.00 39.99 40.41 46.48 53.33 38.69 35.79 33.80 37.97 37.16 -1.00 35.09 45.84 44.84 54.95 63.77 39.15 44.38 41.07 40.35 43.12 2.17 38.88 0.02 21.79 0.50 0.74 25.14 0.19 0.50 0.09 0.45 0.03 0.18 0.05 1.07 1.46 2.66 23.95 1.76 2.08 0.20 0.12 0.11 0.63 0.01 0.51 2.18 3.37 25.79 1.76 2.24 0.18 0.50 0.05 0.63 1.19 2.88 4.34 5.31 27.18 2.92 4.35 0.80 3.90 0.73 1.39 1.53 2.37 4.72 5.20 26.06 2.24 3.48 1.16 3.70 1.07 1.57 9.02 31.12 28.01 44.97 36.39 29.39 25.20 3.92 25.92 5.64 11.33 20.47 32.90 34.93 52.37 37.66 34.34 32.60 6.13 32.98 11.19 17.37 9.89 36.02 0.23 21.91 34.86 35.58 36.08 14.14 30.11 -1.00 -1.00 37.54 34.35 42.37 47.44 36.01 27.75 32.23 22.66 32.92 -1.00 35.83 Average 14.75 19.11 15.12 25.69 22.89 36.36 41.71 31.27 41.20 43.54 8.63 9.78 7.27 14.67 12.63 29.16 33.47 29.05 36.39 Austronesian (6) ceb tgl ind jav msa mri 7.18 9.61 35.82 12.17 29.11 3.29 29.10 35.32 33.73 12.69 33.27 9.48 16.81 22.90 27.85 9.39 28.05 6.71 23.15 32.40 41.10 13.80 37.03 12.73 20.83 28.09 38.97 13.61 35.28 9.54 40.33 49.30 45.33 34.84 46.52 23.39 51.12 53.09 47.54 45.14 51.61 32.34 32.93 36.16 43.08 34.50 45.37 -1.00 48.93 51.78 46.10 45.21 47.62 32.84 57.74 57.79 48.65 50.08 54.68 35.13 1.86 1.97 32.49 3.04 19.15 1.54 8.63 15.27 20.28 3.58 14.40 1.92 6.63 9.80 14.82 4.22 12.62 3.26 9.49 14.25 30.36 7.89 21.17 4.39 9.68 12.39 26.12 7.41 17.87 6.26 26.81 31.58 45.80 18.62 40.13 18.06 31.65 36.43 47.97 24.78 43.49 23.67 24.07 27.83 43.89 
+[Table 7 body: per-language BLEU scores (X⇒Eng and Eng⇒X) for XGLM-7.5B, OPT-175B, Falcon-7B, LLaMA2-7B, LLaMA2-7B-Chat, ChatGPT, GPT4, M2M-12B, NLLB-1.3B and Google, with per-family averages; the remaining families are Atlantic-Congo (14), Afro-Asiatic (6), Turkic (5), Dravidian (4), Sino-Tibetan (3) and Other (14).]
+Table 7: Detailed results (BLEU) of our evaluated models on 102 languages.
+Language Family | Language | X⇒Eng (COMET): XGLM-7.5B, OPT-175B, Falcon-7B, LLaMA2-7B, LLaMA2-7B-Chat, ChatGPT, GPT4, M2M-12B, NLLB-1.3B, Google | Eng⇒X (COMET): XGLM-7.5B, OPT-175B, Falcon-7B, LLaMA2-7B, LLaMA2-7B-Chat, ChatGPT, GPT4, M2M-12B, NLLB-1.3B, Google
+[Table 8 body: per-language COMET scores for the systems listed above, grouped into Indo-European-Germanic (7), Indo-European-Romance (4), Indo-European-Slavic (12), Indo-European-Indo-Aryan (10), Indo-European-Other (9), Austronesian (3), Atlantic-Congo (2), Afro-Asiatic (4), Turkic (5), Dravidian (4), Sino-Tibetan (3) and Other (13) families, with per-family averages.]
+Table 8: Detailed results (COMET) of our evaluated models.
+Language Family | Language | X⇒Eng (SEScore): XGLM-7.5B, OPT-175B, Falcon-7B, LLaMA2-7B, LLaMA2-7B-Chat, ChatGPT, GPT4, M2M-12B, NLLB-1.3B, Google
+[Table 9 body: per-language SEScore results grouped by language family, with per-family averages.]
+Table 9: Detailed results (SEScore) of our evaluated models.
+[Figure 8 panels: radar plots of X->En and En->X performance per language family (Indo-European-Germanic, Indo-European-Romance, Indo-European-Slavic, Indo-European-Indo-Aryan, Indo-European-Other, Austronesian, Atlantic-Congo, Afro-Asiatic, Turkic, Dravidian, Sino-Tibetan, Other), comparing XGLM, OPT, Falcon, LLaMA2, LLaMA2-Chat, ChatGPT and GPT4.]
+Figure 8: Comparison results between our evaluated LLMs on different language families.
+Language ISO 639-1 ISO 639-2/T Language family Language ISO 639-1 ISO 639-2/T Language family
+Table 10: For each language, we list its language name, ISO code and language family.
\ No newline at end of file
diff --git a/tests/integration/encoders/test_openai_integration.py b/tests/integration/encoders/test_openai_integration.py
new file mode 100644
index 0000000000000000000000000000000000000000..f132b297c8c6ca494e9b9b9f54e6b65f4f1269ed
--- /dev/null
+++ b/tests/integration/encoders/test_openai_integration.py
@@ -0,0 +1,37 @@
+import pytest
+
+
+from semantic_router.encoders import OpenAIEncoder
+
+with open("tests/integration/encoders/57640.4032.txt", "r") as fp:
+    long_doc = fp.read()
+
+
+@pytest.fixture
+def openai_encoder():
+    return OpenAIEncoder()
+
+
+class TestOpenAIEncoder:
+    def test_openai_encoder_init_success(self, openai_encoder):
+        assert openai_encoder.client is not None
+
+    def test_openai_encoder_dims(self, openai_encoder):
+        embeddings = openai_encoder(["test document"])
+        assert len(embeddings) == 1
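+        # 1536 is the output dimension of OpenAI's text-embedding-ada-002
+        # (presumably the encoder's default model here)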
+        assert len(embeddings[0]) == 1536
+
+    def test_openai_encoder_call_truncation(self, openai_encoder):
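+        # long_doc is far longer than the embedding model's context window;
+        # with the default truncate=True the encoder should clip it and not raise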
+        openai_encoder([long_doc])
+
+    def test_openai_encoder_call_no_truncation(self, openai_encoder):
+        with pytest.raises(ValueError) as _:
+            # truncation is enabled by default, so pass truncate=False to force the over-length error
+            openai_encoder([long_doc], truncate=False)
+
+    def test_openai_encoder_call_uninitialized_client(self, openai_encoder):
+        # Set the client to None to simulate an uninitialized client
+        openai_encoder.client = None
+        with pytest.raises(ValueError) as e:
+            openai_encoder(["test document"])
+        assert "OpenAI client is not initialized." in str(e.value)
diff --git a/tests/unit/test_layer.py b/tests/unit/test_layer.py
index da85c10a3e8a2cd84cd3d831cf825cd7c143cf12..3f5cf09194c597925491682705e04159df8f3eec 100644
--- a/tests/unit/test_layer.py
+++ b/tests/unit/test_layer.py
@@ -87,7 +87,7 @@ def cohere_encoder(mocker):
 @pytest.fixture
 def openai_encoder(mocker):
     mocker.patch.object(OpenAIEncoder, "__call__", side_effect=mock_encoder_call)
-    return OpenAIEncoder(name="test-openai-encoder", openai_api_key="test_api_key")
+    return OpenAIEncoder(name="text-embedding-ada-002", openai_api_key="test_api_key")
 
 
 @pytest.fixture