From 8bf5b4acfdb679a3de3b1503815b1a935c23436f Mon Sep 17 00:00:00 2001 From: Alex Yang <himself65@outlook.com> Date: Tue, 2 Jul 2024 14:48:51 -0700 Subject: [PATCH] fix: llama parse input spreadsheet (#1007) --- .changeset/happy-hairs-deny.md | 5 + .../src/readers/LlamaParseReader.ts | 219 +++++++++--------- .../tests/readers/fixtures/test.xlsx | Bin 0 -> 8441 bytes .../tests/readers/llama-parser-reader.test.ts | 15 ++ 4 files changed, 129 insertions(+), 110 deletions(-) create mode 100644 .changeset/happy-hairs-deny.md create mode 100644 packages/llamaindex/tests/readers/fixtures/test.xlsx create mode 100644 packages/llamaindex/tests/readers/llama-parser-reader.test.ts diff --git a/.changeset/happy-hairs-deny.md b/.changeset/happy-hairs-deny.md new file mode 100644 index 000000000..76e9b70af --- /dev/null +++ b/.changeset/happy-hairs-deny.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +fix: llama parse input spreadsheet diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts index 2c4150e87..cc3028027 100644 --- a/packages/llamaindex/src/readers/LlamaParseReader.ts +++ b/packages/llamaindex/src/readers/LlamaParseReader.ts @@ -1,105 +1,100 @@ import { Document } from "@llamaindex/core/schema"; import { fs, getEnv } from "@llamaindex/env"; -import { filetypemime } from "magic-bytes.js"; +import { filetypeinfo } from "magic-bytes.js"; import { FileReader, type Language, type ResultType } from "./type.js"; -const SupportedFiles: { [key: string]: string } = { - ".pdf": "application/pdf", - // Documents and Presentations - ".602": "application/x-t602", - ".abw": "application/x-abiword", - ".cgm": "image/cgm", - ".cwk": "application/x-cwk", - ".doc": "application/msword", - ".docx": - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ".docm": "application/vnd.ms-word.document.macroEnabled.12", - ".dot": "application/msword", - ".dotm": "application/vnd.ms-word.template.macroEnabled.12", - ".dotx": - "application/vnd.openxmlformats-officedocument.wordprocessingml.template", - ".hwp": "application/x-hwp", - ".key": "application/x-iwork-keynote-sffkey", - ".lwp": "application/vnd.lotus-wordpro", - ".mw": "application/macwriteii", - ".mcw": "application/macwriteii", - ".pages": "application/x-iwork-pages-sffpages", - ".pbd": "application/x-pagemaker", - ".ppt": "application/vnd.ms-powerpoint", - ".pptm": "application/vnd.ms-powerpoint.presentation.macroEnabled.12", - ".pptx": - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ".pot": "application/vnd.ms-powerpoint", - ".potm": "application/vnd.ms-powerpoint.template.macroEnabled.12", - ".potx": - "application/vnd.openxmlformats-officedocument.presentationml.template", - ".rtf": "application/rtf", - ".sda": "application/vnd.stardivision.draw", - ".sdd": "application/vnd.stardivision.impress", - ".sdp": "application/sdp", - ".sdw": "application/vnd.stardivision.writer", - ".sgl": "application/vnd.stardivision.writer", - ".sti": "application/vnd.sun.xml.impress.template", - ".sxi": "application/vnd.sun.xml.impress", - ".sxw": "application/vnd.sun.xml.writer", - ".stw": "application/vnd.sun.xml.writer.template", - ".sxg": "application/vnd.sun.xml.writer.global", - ".txt": "text/plain", - ".uof": "application/vnd.uoml+xml", - ".uop": "application/vnd.openofficeorg.presentation", - ".uot": "application/x-uo", - ".vor": "application/vnd.stardivision.writer", - ".wpd": "application/wordperfect", - ".wps": "application/vnd.ms-works", - ".xml": "application/xml", - ".zabw": "application/x-abiword", - // Images - ".epub": "application/epub+zip", - ".jpg": "image/jpeg", - ".jpeg": "image/jpeg", - ".png": "image/png", - ".gif": "image/gif", - ".bmp": "image/bmp", - ".svg": "image/svg+xml", - ".tiff": "image/tiff", - ".webp": "image/webp", - // Web - ".htm": "text/html", - ".html": "text/html", - // Spreadsheets - ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ".xls": "application/vnd.ms-excel", - ".xlsm": "application/vnd.ms-excel.sheet.macroEnabled.12", - ".xlsb": "application/vnd.ms-excel.sheet.binary.macroEnabled.12", - ".xlw": "application/vnd.ms-excel", - ".csv": "text/csv", - ".dif": "application/x-dif", - ".sylk": "text/vnd.sylk", - ".slk": "text/vnd.sylk", - ".prn": "application/x-prn", - ".numbers": "application/x-iwork-numbers-sffnumbers", - ".et": "application/vnd.ms-excel", - ".ods": "application/vnd.oasis.opendocument.spreadsheet", - ".fods": "application/vnd.oasis.opendocument.spreadsheet", - ".uos1": "application/vnd.uoml+xml", - ".uos2": "application/vnd.uoml+xml", - ".dbf": "application/vnd.dbf", - ".wk1": "application/vnd.lotus-1-2-3", - ".wk2": "application/vnd.lotus-1-2-3", - ".wk3": "application/vnd.lotus-1-2-3", - ".wk4": "application/vnd.lotus-1-2-3", - ".wks": "application/vnd.lotus-1-2-3", - ".123": "application/vnd.lotus-1-2-3", - ".wq1": "application/x-lotus", - ".wq2": "application/x-lotus", - ".wb1": "application/x-quattro-pro", - ".wb2": "application/x-quattro-pro", - ".wb3": "application/x-quattro-pro", - ".qpw": "application/x-quattro-pro", - ".xlr": "application/vnd.ms-works", - ".eth": "application/ethos", - ".tsv": "text/tab-separated-values", -}; +const SUPPORT_FILE_EXT: string[] = [ + ".pdf", + // document and presentations + ".602", + ".abw", + ".cgm", + ".cwk", + ".doc", + ".docx", + ".docm", + ".dot", + ".dotm", + ".hwp", + ".key", + ".lwp", + ".mw", + ".mcw", + ".pages", + ".pbd", + ".ppt", + ".pptm", + ".pptx", + ".pot", + ".potm", + ".potx", + ".rtf", + ".sda", + ".sdd", + ".sdp", + ".sdw", + ".sgl", + ".sti", + ".sxi", + ".sxw", + ".stw", + ".sxg", + ".txt", + ".uof", + ".uop", + ".uot", + ".vor", + ".wpd", + ".wps", + ".xml", + ".zabw", + ".epub", + // images + ".jpg", + ".jpeg", + ".png", + ".gif", + ".bmp", + ".svg", + ".tiff", + ".webp", + // web + ".htm", + ".html", + // spreadsheets + ".xlsx", + ".xls", + ".xlsm", + ".xlsb", + ".xlw", + ".csv", + ".dif", + ".sylk", + ".slk", + ".prn", + ".numbers", + ".et", + ".ods", + ".fods", + ".uos1", + ".uos2", + ".dbf", + ".wk1", + ".wk2", + ".wk3", + ".wk4", + ".wks", + ".123", + ".wq1", + ".wq2", + ".wb1", + ".wb2", + ".wb3", + ".qpw", + ".xlr", + ".eth", + ".tsv", +]; /** * Represents a reader for parsing files using the LlamaParse API. @@ -165,7 +160,7 @@ export class LlamaParseReader extends FileReader { fileName?: string, ): Promise<string> { // Load data, set the mime type - const { mimeType, extension } = await this.getMimeType(data); + const { mime, extension } = await LlamaParseReader.getMimeType(data); if (this.verbose) { const name = fileName ? fileName : extension; @@ -173,7 +168,7 @@ export class LlamaParseReader extends FileReader { } const body = new FormData(); - body.set("file", new Blob([data], { type: mimeType }), fileName); + body.set("file", new Blob([data], { type: mime }), fileName); const LlamaParseBodyParams = { language: this.language, @@ -378,19 +373,23 @@ export class LlamaParseReader extends FileReader { return images; } - private async getMimeType( + static async getMimeType( data: Uint8Array, - ): Promise<{ mimeType: string; extension: string }> { - const mimes = filetypemime(data); // Get an array of possible MIME types - const extension = Object.keys(SupportedFiles).find( - (ext) => SupportedFiles[ext] === mimes[0], - ); // Find the extension for the first MIME type - if (!extension) { - const supportedExtensions = Object.keys(SupportedFiles).join(", "); + ): Promise<{ mime: string; extension: string }> { + const typeinfos = filetypeinfo(data); + // find the first type info that matches the supported MIME types + // It could be happened that docx file is recognized as zip file, so we need to check the mime type + const info = typeinfos.find((info) => { + if (info.extension && SUPPORT_FILE_EXT.includes(`.${info.extension}`)) { + return info; + } + }); + if (!info || !info.mime || !info.extension) { + const ext = SUPPORT_FILE_EXT.join(", "); throw new Error( - `File has type "${mimes[0]}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`, + `File has type which does not match supported MIME Types. Supported formats include: ${ext}`, ); } - return { mimeType: mimes[0], extension }; // Return the first MIME type and its corresponding extension + return { mime: info.mime, extension: info.extension }; } } diff --git a/packages/llamaindex/tests/readers/fixtures/test.xlsx b/packages/llamaindex/tests/readers/fixtures/test.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..fca20837be21a90ea3fde97b39ecb2a768324a55 GIT binary patch literal 8441 zcmWIWW@Zs#U}NB5U|>*WSQ32nydNV20|yHOgD?XJQ?zq_UP)?RNqk6UL27ZVUPW$> z!Xg$XjRg!$45MH~hrpW2r~M8a2)KUt_~ZOUx9jdE2M*J>O~-|HX~-u$@=Us_UwP-! zKU?AN8yyykW|?nHI)A6w)_C3-vuh&R9V_;(zA>Xef^pq9uie{{=67%ZG*NWMlHf}f z?4q}sHr@R4Ep5{cwgXL@WW&~YYua!&ma>N_eP@)JYho8Q^M&~~r4v<}(E{HWUu<5n z*ZV=x%qm{9xqBRx!_`FBOW(R@*_kjw^!)Av^Fj?Jn=Wqkc@nZ_?k`u(?<{^ZE!nJP zKlJ4o7<#<$|Ht;`v)u}lKy8g@M$5`GPqy8V`1tnmr_;Yx?RAYFTr%%n=lHWKbtRLG zGt0^i%pTW1>?zXTYW>6R;Q50ZpN#fxn)t6S@<&g<%U`yM`=>{rFiLNjv5oD+t<>|2 zmY=w?zVZuWW|8ur`E?R!?RN91uoz}q@I4Qh-0w2$SND4Ezq^ALu-#DF|M(?80|UeV z|I7@i*>G#9f$|py1_mD{1_o{h4yO2`)SO~{JrFsX{YG@IgL{6>Bwzo-20U%=J^r+x z_<r^71OX@Saz~HdiXn9ko_F}S_AXYDzPn#tMj%A0nc2O5Z{43$x9{5YzB-}!?UBiv zZXpqC<!$9_+ou02EM?bHE1Kq3xaQ#z!LqAguYQ~9?p3K#wzpqv^^O$_y-(g(cDwx1 zWuw4(y>KZ`;iWQdVzx6IqmKDT-b?zij`6Bf@Y}U*8@8J-cs2R_?E34|mc8V)@38cF zb7`jNiQ=U6o9k*$<S2gqDHR%=VVSuhX<m_)Qe}Kf&VkP^v-b=Ct<m+Hb0SE|MCgan zdH3V03%=GzZ87<HdT!+rbI!x7PkcDp)Gfc_9wW9)B2YX{w}P30;SdJ{11K9+<mi{@ z7iA~q=VycJsWqXu^KM%R)W#?Lcf7FMDd^f+r#DXFISorVWXx~YYDskVb$BeAz9@;^ z@&8`un?;kVHWpl#GSe(_Yfn$(JN|tBHs4$u!<&T{&-oO!`8IED)u{Y!V7y<aSlgi2 z@8Y&4TQhE+sY#prGxOhb`RCG#^6y(KOc)ltO-oyP#mZ%cw$`-^?|jxoa!;E&fj4x+ z6#f;_-fpMYCHa_|ZZiB9;9QygPQa;>t1V(}#lfJ!FTB}T`CK;5(pqpowB~MahHoKP zjPujv{oA-Mu3>yM?|O_a|AJ)~MIIh?)>hfOUh$1?Vei_O_3ZXdi5rejK2TB;a+q7w zYGY73LvPuV{zvK(f}iwQZ$Flq)AmH+<h&j09yU(>Vt3+{bwcI(1eX6sQjD3rQJjjE z@-D^vcXEpB_B}G)eb@8)g4bWuZ@uukx-9UrU#e>OnmIG=bdI*|y*=Tt)w@dGuqIm@ z(~Ex;o@ZxH$#MN1rR#V?`Oldx8*L9pR5uwciZZ+zbX4e~)`f?5>*BB8X_y<rB$B6d zW54Zm9zC%a+)1qy#T56xU$=5qPuw+Q;o6r96-P363odF&xuPLDdz)Cn$qUk@mmaVF zApC1dE1#6(73;L!&*hC7SW+#rK16vk&(pu;z2Y!$ZPO};x2OLu-P^E*JwE=<fwOl! z^P>IcxQB^ZHj3W7vH8(~&c7<Y1%7Jx6}Vouwl3Ja>f6+7GmI}cZjzh1?nT)a0hw=m zYHa_rzPFz?uU^9Kwz=%ecbVJg<n9XheX(@o-p9F`W(t!0CdpFgAK#jPYr`F$Yu0hA zxQ+-VJm2tL{4u|1fZD#r=8k`6-8xfqFw&@(qflq|jhRzb`jUN(?YGwC&tg|OZ1Ht- zqfpi1%#8U5a%c3GP59HBx02=8mU*RB4<p-t@l*y*T3GWqn{_SsgdOwPjyEJ!U$DQu z>a^e4^NDPwg$vweWPVn>im;fyW16r}u6BXiwy72?=kBbVmbP&<_mbQ7KdfgxQ212v zC;p<2w$|04OOFKq^@`<)Ugle}g`1;dpQ`k3#~yWuizhwvEK|DHT6~i^?fbd&^<<Ni zcm7OW!&m?HpqR*lhq-Q{{MUo;pGlN`WqAL+oLTtp-rBkM?(Qm>`0CV~`~U2ZOLy!` zQ@47)UENA;{`zU*>c8UBPOmTjx$@k_&^d0)f4g_PemQUSyV{27Z@SaPfC5`SBiF?~ z&+cQ*q`mF83LY>pFz7NdFi0_QFjnN~Ln<@mtO~2x7&R6!Fv3f?5z1II@pRT<P`j%2 zU-OJw+cK`DQ%>BR;<WAPUk1f~`6JU>(k5N2kAK*7*TKW=CdWFqyTvNG<@S#k_=)WN z?&m!5>4FoH@!!{7{SaDfv$rY9aGhCm`I1DXw%DIPHvC<3(8KCTtbD{$vEx!!6F;<_ z*tYm$!87KaCYd!dy-Osbd5+2Xc<(s*S-j`J^#YsMA-|)X?_58>V3nrIr?+|cP4+G= z&+k{Z+}YqTEz?6hU}~psXWRZcs_q9)|2S*=aniijuwOETaoyQ`M{Q;I)!G!etE@i0 z>8ORjn550`C!fTz72>%w+H{$i7#OxPGcZUnFfc$1@#2is)RJO-Fl7iT&F4(Mo%Ps2 zpzS^PZ$78*eKk?$hJ{|2w&c9``d*@Y`f-E{yRxS5p-biOZ!CQ2l;`BgRJr75THVX< zf8H$+tz><^)Kt`0%}_O=C3nfqkT2iYAJ6dW<iBGQ6Ui)U+qAsAJKy~C-qPs%j4Vwo zkF{^LUYOofd9m?XT~f8dj>}QYD-ZZ@Puy32V78&B_OxS#2Ti88&Er_QyDjl{7*|B} z#>)wZI3GEDZ20@z@7urTWk*dd&fYD%?tbW%oZ!|FBN=xM)u<`{(#f*xMPJ#eSc*S+ zP_V>ebH7pYpGA|V9g57qZQy8R{k2B3{#aU1xADuM|DiKf()r?^erTA#;!C!!>izR; zz6ejN6%3n`c(`<Dk^1bef8LuHbu3?Pb1Ln%&aU3i?HSw7yjSM&pWOcSZPr$udoPmi zdDJ{~6fL|ERx&eS+l>BaKa~^T|Bs%0;Pw0k4lD<lI8~d?v$MA!KB&%f)LCoyiCUSx zKi_zqwY?MlHQa2WGs8b--YqrD4|U)D&#?0)lN<LHgBc5Q`m#ST-EDv1+4Z6v+-3z6 zn>5dP`05sV>Udt+&I}d-6Pq+WeEh%q>Uf<$qaAj_(^q?AsL=)E@F0y-KIa=YdVk_R zuxObVm*-Da)t@^J;@q;5ys`v4($_4RFl~N3)8iMnZ@!rRTs&1geaW(EH$VjfW()aS z!WPFJ><kQoN}!G^INg+Fq~@mTgGq3bc@uTEUwXUfe)bdp9nQpFJM*GC(elD_^Kb8} z9!`yY;^8@^E$&=_L(pz*5k{`(SG*gUGB0{J-mB8+V#{b?+xev5FhSY(e}_$x+pLFO zU3>pIOiC)(H@@=T_TRI;$B&!$7k{6;dd5O-C6%Y1*SX*QkW)I@b3Sv^n<{^`$6mLc zgC3^zN&0(Sj~0GeTlD^M|GP)bq7!E*Oz?lg_^xVV&`ZayTImKoPi5y#VyTNR5KLy> zo)kHkyN0tWIgGb<e`K$rug#PP5?Ab^bYAo<=nQ*#_qK+-;SS#^SD1~}&TYv`@|1Y< z$L0dZ&S}hQ7g*C``Jc`Fe42A-kX%xP){lj+e?B|E`rhv7qwG5cf~!-!i;_Q2KmC;H z^XCg^G^bsDyX2on&h_TCb<7D_;^tYe|F5-UKCoHRCiB?RHO3hxf6HbhdoOr%Y2oh6 zTv=xOmh9hl*z$$R@5H;8@*QUW`}NMa&w6p<;X~2^ZTW8=RM*{je??~xW9pA*eEaNP zZ9O!x)qFw6#V0l5cXPJhJ}a}gRmZ}*<dF5*W%s3?ePCj}xS?0i_GpuOR{WZyKP1JI z88Q?Ocd2=-4c>IXxZ_dWsvR!QTET7C<-7_Hsq*|z6mWQP^>4xNe4Rdi4n5E8xC1Wl zcrxug&awE0Uaw||+AH+rsrmgq75~0=KmT9(@u;}|y#L&?7gLuQg-*Mp=r=8Or~JQv z$N&HRTK;1H|9`jn<NuxPou2w<b^X78SGV8)JEbZjt5q}Z-_O<I@%3Ln>DT}GCm(OM z<vYu~X%B^^gggCCs8*dxu5d`^nr%O`YqqD4l8-K@Ok-aAtBGAY4^B!Mi?8#~5w;VZ zkQ#H~_v&=9>zh5+h?JiBW6-7h!0`E+^R1T|_`k-ea{Gs!ykxsGVrpT(;l6^iwg)<& zKDemW`o$|&i(PPI#yq1r8hcbY13ms9JI`5Yb2oiTRGIBE<AbX48eb$`YklUb&AhNu z)T8x+V(&ryY;nhet*NsgFJEzf=IfR1PY$^q+H~NIW1Z8C)MLLZmKMATvOH+0E&YdM z^$FKcIl5`7cRaK{e3@j>=T@w=)ce`QhUdDLe-+z~JS+Ka7=7F<NTcfArOFmdedYE) zH3APUN{g0xh8(NYI<M$rX!Z0ItNpWYTo=DTV4w4*^n0+EPWgU2L7PUF@MR$^9sg2Q zW%ljy-?3NjdQM}f_USJ!IjPs@Ejah$#rkPs8S;XMKB&$UZkku~D&fNOm8qYbzBF@e zI;B|1dW|{f_^N&W9W6$l*G+e8e(LbjPC0*K())+Sd-hGMxMtD0@y7C>EPL`=--R4V zDha4KlJ-3H0cYzc{zpeMr??B1Olhj#;{Q6cW$j+?``+by*0ny}ZyuSvL2v%ulGrAm zof~;nzL{@3%VM0}q?DjGW07+BE>^+Qe_K25h4<b#rMZ2Bc<w_sjh;1C>dQ{P?wT*G z!?OQRW!8NDu&+Br_j~W%tiN6+ZgtVOhjp!6&kC%c6wGw{WZt7)y19AY((nK8+j4XJ zw0!Pax0{}ATTr;SsdhrsvK5I7_Gm`gF8e6_{_SrajpX#KvnM8+td-|nZ~EDgXV$za znLc^H8!}m@|Ef%2;@&QD<AlP&tNAOs&MD1ts@E`xJ+t&osJ^P#&X#QxCC&bZ6!~t? zv|IZq+3x(Uxy$xHNmrW^A^NzES<3&W#g<heZF$>1IiAgTzPllIR@m#dhG}~_J1?b} zb<S#b$u|BWCA6#fTU|tnjLxm7zsEwYmoa2TuRl<2-n9Mey6Y>_dlo3OoBoRVty%o_ z*zQ}F@2_6|-E;DkAj8_DvHs;I*Z1}>pRdQg)%?R2*FzSn+}g8SK2FvAkiwFGxUyv0 z?|1g11?FE%mlv#b^`D)zS&B<)?YH(1Tm6)-7cqOEn%SiKLg-orNB22r<2hw}S-<o? zH1bodSW@@>P!4CizOm{i(f?*ERYik%H})8H&t5b6r~0xkjw2a&zJB?ff4EUqtuy6` z)rOM#l~0`ZKB&ula^Us0=_ivM(^kFteyL+yNg21MowwMz_}(A4o42aOH|{>a=%?ze zO;am+6pu))b*L`)Q<aftnYsG8$hpJYX0GYjpKTpH@9$dYbxQKd)7&iX2c5cU(>$9u z)%*CJ+}wa&r{_!Z?8-j$GU)w=_j}&%t6P6<n)bKjej78RGF|g$zsvdcu<WPM>Ll^4 z5ve<WSDkyCGf9?ZRq^bK?|b*VtetUY(K5;R&l2Zu`5ZX=R`nVVuigH|{dc#`Zn-R! zzx!+Rov$mdS|3@q!K5*t@7(G1+cm#<CLI^>6Aag}ew)U5XXYg%ThU|NgYDv<Pxndd zRQtSB&ueqYzZp6LbK^_*UI?{()_%Ow^nj4V>{;SrH^MGNzP;I0wfVH`X4c)J=T6<o z{<=zc&(s?-oAX+IvoE?v$L9K7DP5bFc8%}!yjx2WlQnnT+N!kWBx~CxOY1Xdi}at& z)SeoCtDrTksci54U%hd)0%mNY`|nDGzS_DgKGVEvtJnIAD_9r1J>dWS@VSG(YvF|x z`)VW(X9`3=zi&U|<J^3OM`s(a&z0<7eW&JefayK+`wlD*|K%$jnCPG_!}@iDIG<K? z;)OL=4kvEDly|jis^blrHrpFpW<S37TyB@ag9Y|7eaCw@T#{i}_wKvY)VTd-D@<-L zuCw5C`?`?-LCx=-P1_~r78ge!So*N^y*s$+fIbwqQ*3|AN+t$|a5e@8UPuG6xTG>C zwHVYYToRg_ecM3bpWTgmfnV=Vl|9zs+~$4xO2CU#3uND{PCDJWBCco2l5@xYRr{C4 z$yjZ>Sl8*l{BrU6yUMYbR)=wINj+Ay)<`C{MZz%jRoCj!w*B^^2U(O~uUNI>LRV_V zah-jq?CZVO-sb$aBkhc0Lx9=g?B}y26}QD0d%kVWo6$5kO4?Ia=WN%FV+K)?^&VV8 z?GI;ooDzK}n6~MUynyG~&&xQybfsVYn8f|AOjgWo)113&r<j#!He^ZnW^~@l$Vs{~ z%X!kd-)j9&mR=0I{*I}jb6s}artKxmQ@k=NotU0{S9Im%e7yNq^k=^%f`&P#_tseW zXRn>(y7Zm~dlFON{id6nFX^xP5Pk2a`_q+wJGX^2s2@}D_txFw$hV~Voa?>CD^hBY zls?|#%6~=T@1yc8t(`Z%R^NT-_o;Tn6NQF;rFm!mB|j7pf8?S2P3OSG<qH-jyNPaa zsQ#UKoq?V8`r7uFkNyenH7XH5dh!9oOQB~kgJyHB`@T|>J$xQln6*OIkF2-*{meh? zzH;uM=!wf+m#2n>{0OR=u$b}b6rT?F|15P^(r@JUUU3t<bT?G8?f8?~1qKTz?7jK; zD|f%W%j=EyJl(f`$;b<^Z#nNFe_2_-s!jgSY|(<+Zvq)RA08_F%6_OJa_e%jmN>@e z_v5e6NfvwIax`m~;JcSQW<H$!cWP*j^5f(y=KHR$Yd-Pe<i77-uF3P8x4ye~=xdYy zEs1##<F|j-zkD=TkbQE29jDuL9nNJZYXnsm&;FSd=yTX&&rjPKO!6}r?_M|+ToG%m zoe(smcH?g$dH;Pp{9?9$-}N!QRs5c5!?%9222a;%11`5@D^9h?r&Tm2X~gxof4;rM zgYC}b@^f7iTmtP&ZGNUa+mX_=x;c!!wJlD7b6UM-lgt!_<@GIpM6q>x1d`geIx#XZ zWHK=@2r_UmrsO9F6y+Bb>nG<IrGnD6!Xg&N(Y`PP!<0#}-iHk&THmw&cRpD=d&}4E zOJxg;1Y|h;)b6!Dxhs}q=dtMSzrC5eufE#z(a7?7ZP7XJQ#bw0E~-bUZ&?*EOF*PC zX%_d^l^0GI$=zRYqH>i*?v^I5w+9zS*4Bj?1smi(JekFOjk_RFK{MPSc;VXb%MRSw zsO#}+(t`lUm~8Ep{O-FCwrwik#W+W>s8zM%)XL>Mn2)8YEMlItSuoQs=Q6X!;R_8c zPi6fki_}-!M!ftd^I~deFxP(bg_jlIO*wm0q3_*X;l&$sZ!bCfTSBqED_*?+eC^eU z6?x%20kK742HaP&qwNDk*KT>1ebGd99>-lF=~sW0-c;*0+<#YJ>Hqb>+iaeK2Z0=6 zJf?l8wD#|gKNmQ6;u|Hk>xL(%NosyNt)!ZN_p|-8-3t!4+O+)OH$1I=M0tVU$CJ-j z?8jP|9*T?Pn83)uAkECcAb_0k5(^4O(;Z^KY0f0y{M(?xb>?64Ja?rna|3xA*Ccs_ zf7y`4IH@e%Z&~ufO)4F0Cce4uzwP3cn`O6p<@Mw27Oi;4H&?sLC#8A)k)psZroEh7 zzy9{Jc`v7D-`ylp!>d=Z=7^x@otNGIS(BZP91&Z2LTsMaMWf?Wr=&!^X%O4_ws<+` zO6GH!w@zO$x%QyeAYZm>T6k)u_QwM+ve)Q8J{jQCw|d_H>s$EJ9xR@I*Dx(%?Y<)~ z)-ax1f5rdW5{HEkA9414m06*?Pmae*^|{eh%k3HWCOuuOH+9bPtn331t@<zUS~#iN zbG;HwjJVI1qqMdv?rVm{lBPH7_Z-j;{KImHagC6z3imwY2MV(%8J117cqy;AH>mr> zk`ytYqlR;`pKsiI@pRyX=HiD528~aSSw1?q$@$m49EFee9n*^+O-y_=w_qFdeUsvI z=606;zYbT+?&A12Lwfm(*<aUxktu&DCvpFJWn9Q@F`;We+5KMIiC#&*mr~Sxau@6V z<G*4*mnT~^Pk(vs>!-57xamLR(@j<F*G2tjgB72QOu7uXrl}F}24P{Dv<~n_H2`&j z4WbP~wlgy@Kt?F_K+|pL8qp^mAeunr8aD7K1oF5hx@OeDP>?1V4&(xBhFFYj1Xf3& z51PSDVPGia0h<Fd0X%kwJcNes8uTF)gefcez@~tQQji8z&<#Lunjs9BBnCDB(NIIz yidq9AG%_%xNir~C)Q9NWQOhx8?f0dTv_ng^0B=?{kQz<~P6j(B28N|_ARYjGAkek| literal 0 HcmV?d00001 diff --git a/packages/llamaindex/tests/readers/llama-parser-reader.test.ts b/packages/llamaindex/tests/readers/llama-parser-reader.test.ts new file mode 100644 index 000000000..b43cf9915 --- /dev/null +++ b/packages/llamaindex/tests/readers/llama-parser-reader.test.ts @@ -0,0 +1,15 @@ +import { LlamaParseReader } from "llamaindex"; +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { expect, test } from "vitest"; + +const fixturesDir = fileURLToPath(new URL("./fixtures", import.meta.url)); + +test("file type should be detected correctly", async () => { + const xlsx = join(fixturesDir, "test.xlsx"); + const buffer = await readFile(xlsx); + const { mime, extension } = await LlamaParseReader.getMimeType(buffer); + expect(mime).toBe("application/vnd.oasis.opendocument.spreadsheet"); + expect(extension).toBe("ods"); +}); -- GitLab