From 081a4d7118e976f416894c1745fef61f8f73becc Mon Sep 17 00:00:00 2001
From: Logan <logan.markewich@live.com>
Date: Thu, 6 Mar 2025 12:57:41 -0600
Subject: [PATCH] anthropic caching and thinking updates (#18039)

---
 docs/docs/examples/llm/anthropic.ipynb        | 773 ++++++++++++------
 .../llama_index/llms/anthropic/base.py        | 118 ++-
 .../llama_index/llms/anthropic/utils.py       |  18 +-
 .../llama-index-llms-anthropic/pyproject.toml |   4 +-
 4 files changed, 660 insertions(+), 253 deletions(-)

diff --git a/docs/docs/examples/llm/anthropic.ipynb b/docs/docs/examples/llm/anthropic.ipynb
index 57b4374727..82619934cd 100644
--- a/docs/docs/examples/llm/anthropic.ipynb
+++ b/docs/docs/examples/llm/anthropic.ipynb
@@ -15,11 +15,9 @@
    "source": [
     "# Anthropic\n",
     "\n",
-    "Anthropic has recently released its latest models: `Claude 3 Opus`, `Claude 3 Sonnet`, and `Claude 3 Haiku` (which will be available soon). By default, the `claude-2.1 model` is used. This notebook provides guidance on how to utilize these new models.\n",
+    "Anthropic offers many state-of-the-art models from the haiku, sonnet, and opus families.\n",
     "\n",
-    "1. Claude 3 Opus - claude-3-opus-20240229\n",
-    "2. Claude 3 Sonnet\t- claude-3-sonnet-20240229\n",
-    "3. Claude 3 Haiku - claude-3-haiku-20240307"
+    "Read on to learn how to use these models with LlamaIndex!"
    ]
   },
   {
@@ -40,16 +38,6 @@
     "%pip install llama-index-llms-anthropic"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "50fc1a30",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install llama-index"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "3cbf8694-ad53-459a-84c1-64de2dadeaf5",
@@ -57,7 +45,7 @@
    "source": [
     "#### Set Tokenizer\n",
     "\n",
-    "First we want to set the tokenizer, which is slightly different than TikToken.\n",
+    "First we want to set the tokenizer, which is slightly different than TikToken. This ensures that token counting is accurate throughout the library.\n",
     "\n",
     "**NOTE**: Anthropic recently updated their token counting API. Older models like claude-2.1 are no longer supported for token counting in the latest versions of the Anthropic python client."
    ]
@@ -81,7 +69,7 @@
    "id": "b81a3ef6-2ee5-460d-9aa4-f73708774014",
    "metadata": {},
    "source": [
-    "#### Call `complete` with a prompt"
+    "## Basic Usage"
    ]
   },
   {
@@ -93,7 +81,15 @@
    "source": [
     "import os\n",
     "\n",
-    "os.environ[\"ANTHROPIC_API_KEY\"] = \"YOUR ANTHROPIC API KEY\""
+    "os.environ[\"ANTHROPIC_API_KEY\"] = \"sk-ant-...\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "712ea8f4",
+   "metadata": {},
+   "source": [
+    "You can call `complete` with a prompt:"
    ]
   },
   {
@@ -108,9 +104,9 @@
     "# To customize your API key, do this\n",
     "# otherwise it will lookup ANTHROPIC_API_KEY from your env variable\n",
     "# llm = Anthropic(api_key=\"<api_key>\")\n",
-    "llm = Anthropic(model=\"claude-3-opus-20240229\")\n",
+    "llm = Anthropic(model=\"claude-3-7-sonnet-latest\")\n",
     "\n",
-    "resp = llm.complete(\"Paul Graham is \")"
+    "resp = llm.complete(\"Who is Paul Graham?\")"
    ]
   },
   {
@@ -123,21 +119,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Paul Graham is a well-known entrepreneur, programmer, venture capitalist, and essayist. He is best known for co-founding Viaweb, one of the first web application companies, which was later sold to Yahoo! in 1998 and became Yahoo! Store. Graham is also the co-founder of Y Combinator, a highly successful startup accelerator that has helped launch numerous successful companies, such as Dropbox, Airbnb, and Reddit.\n",
-      "\n",
-      "Some key points about Paul Graham:\n",
-      "\n",
-      "1. Programming: Graham is a skilled programmer and has written extensively on the subject, including his book \"Hackers & Painters: Big Ideas from the Computer Age.\"\n",
-      "\n",
-      "2. Essays: He is a prolific essayist, writing on various topics related to technology, startups, and entrepreneurship. His essays have been influential in the tech startup community.\n",
-      "\n",
-      "3. Lisp: Graham is an advocate for the Lisp programming language and has written several essays on its advantages.\n",
-      "\n",
-      "4. Y Combinator: As a co-founder of Y Combinator, Graham has played a significant role in shaping the startup ecosystem and has mentored and invested in numerous successful companies.\n",
-      "\n",
-      "5. Wealth and inequality: In recent years, Graham has written about income inequality and the concentration of wealth, sparking discussions and debates within the tech community.\n",
+      "Paul Graham is a computer scientist, entrepreneur, venture capitalist, and essayist. He co-founded Viaweb (one of the first web application companies, later sold to Yahoo! and became Yahoo! Store), and later co-founded Y Combinator, an influential startup accelerator that has helped launch companies like Airbnb, Dropbox, Stripe, and Reddit. \n",
       "\n",
-      "Overall, Paul Graham is a significant figure in the technology and startup world, known for his contributions as a programmer, investor, and thought leader.\n"
+      "Graham is also well-known for his essays on technology, startups, and programming, which are published on his website. He created the Lisp dialect called Arc, and authored books including \"On Lisp,\" \"ANSI Common Lisp,\" and \"Hackers & Painters.\" He has a PhD in Computer Science from Harvard and studied painting at the Rhode Island School of Design and in Florence, Italy.\n"
      ]
     }
    ],
@@ -147,362 +131,359 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f25ccf92",
+   "id": "09c27c58",
    "metadata": {},
    "source": [
-    "#### You can also use an anthropic model through Vertex AI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c4a9db35",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "os.environ[\"ANTHROPIC_PROJECT_ID\"] = \"YOUR PROJECT ID HERE\"\n",
-    "os.environ[\"ANTHROPIC_REGION\"] = \"YOUR PROJECT REGION HERE\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ed545c13",
-   "metadata": {},
-   "source": [
-    "##### Do keep in mind that setting region and project_id here will make Anthropic use the Vertex AI client"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bd125110",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from llama_index.llms.anthropic import Anthropic\n",
-    "\n",
-    "llm = Anthropic(\n",
-    "    model=\"claude-3-5-sonnet@20240620\",\n",
-    "    region=os.getenv(\"ANTHROPIC_REGION\"),\n",
-    "    project_id=os.getenv(\"ANTHROPIC_PROJECT_ID\"),\n",
-    ")\n",
-    "\n",
-    "resp = llm.complete(\"Paul Graham is \")"
+    "You can also call `chat` with a list of chat messages:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "de92ce84",
+   "id": "7a79dd31",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Paul Graham is a well-known computer programmer, entrepreneur, venture capitalist, and essayist. Here are some key points about him:\n",
-      "\n",
-      "1. Co-founder of Y Combinator: Graham is best known for co-founding Y Combinator, one of the most successful startup accelerators in the world.\n",
+      "assistant: # THE TREASURE OF CRIMSON COVE\n",
       "\n",
-      "2. Programming language creator: He created the programming language Arc, a dialect of Lisp.\n",
+      "*Arrr, gather 'round, ye curious soul, for I be havin' a tale that'll chill yer very bones!*\n",
       "\n",
-      "3. Entrepreneur: Before Y Combinator, he co-founded Viaweb, one of the first web-based application companies, which was later acquired by Yahoo!.\n",
+      "'Twas fifteen years ago when me and me crew aboard the Salty Vengeance caught wind of a treasure most rare - the Sapphire of Poseidon, said to control the very tides themselves! The map came to me hands after a particularly spirited game o' cards with a one-eyed merchant who'd had far too much rum.\n",
       "\n",
-      "4. Author: Graham has written several books on programming and startups, including \"Hackers & Painters\" and \"On Lisp.\"\n",
+      "We set sail under the cover of a moonless night, navigatin' by stars alone to reach the dreaded Crimson Cove - a place where the water turns red as blood when the sun sets, on account of the strange coral beneath the waves.\n",
       "\n",
-      "5. Essayist: He is known for his insightful essays on technology, startups, and society, which are widely read in the tech community.\n",
+      "Three days into our journey, the skies turned black as pitch! A storm like none I'd ever seen! Waves tall as mountains threatened to swallow us whole! \"HOLD FAST, YE MANGY DOGS!\" I bellowed over the howlin' winds.\n",
       "\n",
-      "6. Investor: Through Y Combinator and personally, he has invested in numerous successful startups, including Dropbox, Airbnb, and Reddit.\n",
+      "When we finally reached the cove, half me crew was convinced the treasure was cursed. Bah! Superstitious bilge rats! But I'll not be lyin' to ye... when we found that hidden cave behind the waterfall, and saw them skeletons arranged in a circle 'round an empty chest... well, even ME beard seemed to tremble of its own accord!\n",
       "\n",
-      "7. Advocate for startups: He has been a strong proponent of entrepreneurship and has helped shape the modern startup ecosystem.\n",
+      "The real treasure weren't no sapphire at all, but a map to somethin' far greater... somethin' I still be searchin' for to this very day!\n",
       "\n",
-      "8. Computer Science background: Graham holds a Ph.D. in Computer Science from Harvard University.\n",
+      "*Leans in closer, voice dropping to a whisper*\n",
       "\n",
-      "9. Influential thinker: His ideas on startups, technology, and society have had a significant impact on Silicon Valley and the broader tech industry.\n",
-      "\n",
-      "Paul Graham is widely respected for his contributions to the tech industry, his insights on startups and innovation, and his role in shaping the modern entrepreneurial landscape.\n"
+      "And perhaps, if ye prove yerself worthy, I might be persuaded to let ye join the hunt! HARR HARR HARR!\n"
      ]
     }
    ],
    "source": [
+    "from llama_index.core.llms import ChatMessage\n",
+    "from llama_index.llms.anthropic import Anthropic\n",
+    "\n",
+    "messages = [\n",
+    "    ChatMessage(\n",
+    "        role=\"system\", content=\"You are a pirate with a colorful personality\"\n",
+    "    ),\n",
+    "    ChatMessage(role=\"user\", content=\"Tell me a story\"),\n",
+    "]\n",
+    "llm = Anthropic(model=\"claude-3-7-sonnet-latest\")\n",
+    "resp = llm.chat(messages)\n",
+    "\n",
     "print(resp)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "d6ad0a98-92dd-48fd-9823-175d701c1ab2",
+   "id": "ee87da7e",
    "metadata": {},
    "source": [
-    "#### Call `chat` with a list of messages"
+    "## Streaming Support\n",
+    "\n",
+    "Every method supports streaming through the `stream_` prefix."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5fd3137b-05ce-40a5-bdb0-5ce048f5ca25",
+   "id": "e49a2681",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Paul Graham is a computer scientist, entrepreneur, venture capitalist, and essayist. He's best known for:\n",
+      "\n",
+      "1. Co-founding Viaweb (later sold to Yahoo and became Yahoo Store)\n",
+      "2. Creating the programming language Arc\n",
+      "3. Co-founding Y Combinator, an influential startup accelerator that has funded companies like Airbnb, Dropbox, and Stripe\n",
+      "4. Writing influential essays on startups, programming, and technology that are published on his website\n",
+      "5. His work on Lisp programming language\n",
+      "\n",
+      "Graham is widely respected in the tech and startup communities for his insights on building companies and technology development."
+     ]
+    }
+   ],
    "source": [
-    "from llama_index.core.llms import ChatMessage\n",
     "from llama_index.llms.anthropic import Anthropic\n",
     "\n",
-    "messages = [\n",
-    "    ChatMessage(\n",
-    "        role=\"system\", content=\"You are a pirate with a colorful personality\"\n",
-    "    ),\n",
-    "    ChatMessage(role=\"user\", content=\"Tell me a story\"),\n",
-    "]\n",
-    "resp = Anthropic(model=\"claude-3-opus-20240229\").chat(messages)"
+    "llm = Anthropic(model=\"claude-3-7-sonnet-latest\")\n",
+    "\n",
+    "resp = llm.stream_complete(\"Who is Paul Graham?\")\n",
+    "for r in resp:\n",
+    "    print(r.delta, end=\"\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0d38e262-c98d-4780-aef0-efc00c251da6",
+   "id": "2cffe6ec",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "assistant: *clears throat and speaks in a pirate accent* Aye, gather 'round me hearties and I'll spin ye a yarn of adventure on the high seas!\n",
-      "\n",
-      "T'was a dark and stormy night when the Black Pearl set sail from Tortuga. The salty sea spray stung me eyes as I stood at the helm, guidin' me beloved ship through the roilin' waves. Me loyal crew scurried about, securin' the riggin' and battening down the hatches. \n",
-      "\n",
-      "Suddenly, the lookout cried \"Ship ahoy!\" and pointed off the starboard bow. I raised me spyglass and spied a Spanish galleon, her decks heavily laden with treasure. The crew gave a hearty cheer - we'd be feastin' and drinkin' well tonight!\n",
+      "Paul Graham is a computer scientist, entrepreneur, venture capitalist, and essayist. He's best known for:\n",
       "\n",
-      "I ordered the crew to ready the cannons as we drew alongside the galleon. \"Fire all!\" I bellowed and the Pearl shook as the guns unleashed a barrage. The Spaniards returned fire but they were no match for me skilled gunners.\n",
+      "1. Co-founding Viaweb (later sold to Yahoo and became Yahoo Store)\n",
+      "2. Creating the programming language Arc\n",
+      "3. Co-founding Y Combinator, an influential startup accelerator that has funded companies like Airbnb, Dropbox, and Stripe\n",
+      "4. Writing influential essays on startups, programming, and technology that are published on his website\n",
+      "5. His work on Lisp programming language\n",
       "\n",
-      "We boarded the galleon, swords flashin' and pistols blazin'. The fight was fast and bloody but in the end, the Pearl was victorious! We claimed the treasure as our own - mountains of gold and jewels glintin' in the moonlight.\n",
-      "\n",
-      "As we sailed away, I couldn't help but grin. T'was a fine night of piratin' and I knew many more adventures lay ahead for me and me crew. No matter the danger, the Black Pearl would always prevail! Yo ho ho!\n",
-      "\n",
-      "*laughs heartily* And that, me friends, is a taste of the pirate's life. May yer sails always be full and yer horizons bright. Fare thee well!\n"
+      "Graham is widely respected in the tech and startup communities for his insights on building companies and technology development."
      ]
     }
    ],
    "source": [
-    "print(resp)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "56a55ce6-08e3-4534-9bae-345686308b3e",
-   "metadata": {},
-   "source": [
-    "## Streaming"
+    "from llama_index.core.llms import ChatMessage\n",
+    "\n",
+    "messages = [\n",
+    "    ChatMessage(role=\"user\", content=\"Who is Paul Graham?\"),\n",
+    "]\n",
+    "\n",
+    "resp = llm.stream_chat(messages)\n",
+    "for r in resp:\n",
+    "    print(r.delta, end=\"\")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "57901d1c-d1d4-442e-bb91-cd8f054ae2fd",
+   "id": "0624c7bd",
    "metadata": {},
    "source": [
-    "Using `stream_complete` endpoint "
+    "## Async Usage\n",
+    "\n",
+    "Every synchronous method has an async counterpart."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "cd9e2b22-7e62-4f50-a9af-84453aeda071",
+   "id": "94f72965",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Paul Graham is a computer scientist, entrepreneur, venture capitalist, and essayist. He's best known for:\n",
+      "\n",
+      "1. Co-founding Viaweb (later sold to Yahoo and became Yahoo Store)\n",
+      "2. Creating the programming language Arc\n",
+      "3. Co-founding Y Combinator, an influential startup accelerator that has funded companies like Airbnb, Dropbox, Stripe, and Reddit\n",
+      "4. Writing influential essays on startups, programming, and technology that are published on his website\n",
+      "5. His work on Lisp programming language\n",
+      "\n",
+      "Graham is widely respected in the tech and startup communities for his insights on building companies and technology development."
+     ]
+    }
+   ],
    "source": [
     "from llama_index.llms.anthropic import Anthropic\n",
     "\n",
-    "llm = Anthropic(model=\"claude-3-opus-20240229\", max_tokens=100)\n",
-    "resp = llm.stream_complete(\"Paul Graham is \")"
+    "llm = Anthropic(model=\"claude-3-7-sonnet-latest\")\n",
+    "\n",
+    "resp = await llm.astream_complete(\"Who is Paul Graham?\")\n",
+    "async for r in resp:\n",
+    "    print(r.delta, end=\"\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "65d68dfc-a97e-4a69-935a-e675fb7b4ed0",
+   "id": "4b04cfd0",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Paul Graham is a well-known entrepreneur, programmer, venture capitalist, and essayist. He is best known for co-founding Viaweb, one of the first web application companies, which was later sold to Yahoo! in 1998 and became Yahoo! Store. \n",
+      "assistant: Paul Graham is a computer scientist, entrepreneur, venture capitalist, and essayist. He's best known for:\n",
+      "\n",
+      "1. Co-founding Viaweb (later sold to Yahoo and became Yahoo Store)\n",
+      "2. Creating the programming language Arc\n",
+      "3. Co-founding Y Combinator, an influential startup accelerator that has funded companies like Airbnb, Dropbox, Stripe, and Reddit\n",
+      "4. Writing influential essays on startups, programming, and technology that are published on his website\n",
+      "5. His work on Lisp programming language\n",
       "\n",
-      "After the sale of Viaweb, Graham and his wife Jessica Livingston co-founded Y Combinator in 2005, a highly successful startup accelerator that has helped launch"
+      "Graham is widely respected in the tech and startup communities for his insights on building companies and technology development.\n"
      ]
     }
    ],
    "source": [
-    "for r in resp:\n",
-    "    print(r.delta, end=\"\")"
+    "messages = [\n",
+    "    ChatMessage(role=\"user\", content=\"Who is Paul Graham?\"),\n",
+    "]\n",
+    "\n",
+    "resp = await llm.achat(messages)\n",
+    "print(resp)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f25ccf92",
+   "metadata": {},
+   "source": [
+    "## Vertex AI Support\n",
+    "\n",
+    "By providing the `region` and `project_id` parameters (either through environment variables or directly), you can use an Anthropic model through Vertex AI."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "10b63238-8d01-48f7-b2ec-a56d23fec172",
+   "id": "c4a9db35",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index.llms.anthropic import Anthropic\n",
+    "import os\n",
     "\n",
-    "llm = Anthropic(model=\"claude-3-opus-20240229\")\n",
-    "messages = [\n",
-    "    ChatMessage(\n",
-    "        role=\"system\", content=\"You are a pirate with a colorful personality\"\n",
-    "    ),\n",
-    "    ChatMessage(role=\"user\", content=\"Tell me a story\"),\n",
-    "]\n",
-    "resp = llm.stream_chat(messages)"
+    "os.environ[\"ANTHROPIC_PROJECT_ID\"] = \"YOUR PROJECT ID HERE\"\n",
+    "os.environ[\"ANTHROPIC_REGION\"] = \"YOUR PROJECT REGION HERE\""
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d90ec6f2-8f49-4f96-9290-c7ed9bb8ba45",
+   "cell_type": "markdown",
+   "id": "ed545c13",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "*clears throat and speaks in a gruff, piratey voice* \n",
-      "\n",
-      "Aye, gather 'round me hearties and I'll spin ye a yarn of adventure on the high seas! \n",
-      "\n",
-      "'Twas a dark and stormy night, the kind where the wind howls like a banshee and the waves crash over the deck. Me and me crew were sailin' the Caribbean, searchin' for treasure and glory.\n",
-      "\n",
-      "Suddenly, the lookout cried \"Ship ahoy!\" and sure enough, a Spanish galleon was bearin' down on us, her decks bristlin' with cannons. The scurvy dogs wanted our gold, but I'd sooner walk the plank than surrender!\n",
-      "\n",
-      "\"All hands to battle stations!\" I bellowed. \"Ready the cannons and prepare to board!\" \n",
-      "\n",
-      "A mighty battle erupted, cannons boomin' and swords clashin'. We swung over on ropes and fought the Spaniards hand-to-hand on the pitchin' and rollin' deck. Me cutlass was a blur as I dueled their captain, a big brute with a wicked scar.\n",
-      "\n",
-      "Finally, I drove me blade into that bilge rat's black heart and he fell dead at me feet. His crew surrendered and we took their ship as a prize. In the hold, we found chests overflowing with gold doubloons and jewels - a king's ransom! \n",
-      "\n",
-      "We sailed off into the sunset, our pirate flag snappin' in the breeze, flush with coin and the thrill of victory. And that, me buckos, is a taste of the pirate life! Now who wants some grog?\n",
-      "\n",
-      "*laughs heartily*"
-     ]
-    }
-   ],
    "source": [
-    "for r in resp:\n",
-    "    print(r.delta, end=\"\")"
+    "Do keep in mind that setting region and project_id here will make Anthropic use the Vertex AI client"
    ]
   },
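+  {
+   "cell_type": "markdown",
+   "id": "ed545c14",
+   "metadata": {},
+   "source": [
+    "For example, here is a minimal sketch that reuses the environment variables set above (swap in whichever Anthropic model id your Vertex AI project has enabled):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd125110",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.anthropic import Anthropic\n",
+    "\n",
+    "llm = Anthropic(\n",
+    "    model=\"claude-3-5-sonnet@20240620\",\n",
+    "    region=os.getenv(\"ANTHROPIC_REGION\"),\n",
+    "    project_id=os.getenv(\"ANTHROPIC_PROJECT_ID\"),\n",
+    ")\n",
+    "\n",
+    "resp = llm.complete(\"Who is Paul Graham?\")"
+   ]
+  },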
   {
    "cell_type": "markdown",
-   "id": "d4b6ea50-d777-4174-a326-6e4e57b9ea8b",
+   "id": "cf8b6f49",
    "metadata": {},
    "source": [
-    "## Configure Model"
+    "## Bedrock Support\n",
+    "\n",
+    "LlamaIndex also supports Anthropic models through AWS Bedrock."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1ce3de8d-287e-402d-936f-64a106c8fac2",
+   "id": "8df734a4",
    "metadata": {},
    "outputs": [],
    "source": [
     "from llama_index.llms.anthropic import Anthropic\n",
     "\n",
-    "llm = Anthropic(model=\"claude-3-sonnet-20240229\")"
+    "# Note: this assumes you have standard AWS credentials configured in your environment\n",
+    "llm = Anthropic(\n",
+    "    model=\"anthropic.claude-3-7-sonnet-20250219-v1:0\",\n",
+    "    aws_region=\"us-east-1\",\n",
+    ")\n",
+    "\n",
+    "resp = llm.complete(\"Who is Paul Graham?\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "37a35c7e",
+   "metadata": {},
+   "source": [
+    "## Multi-Modal Support\n",
+    "\n",
+    "Using `ChatMessage` objects, you can pass in images and text to the LLM."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f1727a8f-7653-42e9-a27b-4826e93ddfe5",
+   "id": "36874ac9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "resp = llm.stream_complete(\"Paul Graham is \")"
+    "!wget https://cdn.pixabay.com/photo/2021/12/12/20/00/play-6865967_640.jpg -O image.jpg"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8b53c20f-bb17-4265-8fd0-8b5921a16495",
+   "id": "152e2e9f",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Paul Graham is a computer scientist, entrepreneur, venture capitalist, and author. He is best known for the following:\n",
-      "\n",
-      "1. Co-founding Y Combinator: Y Combinator is a prominent startup accelerator based in Silicon Valley. It has funded and helped launch thousands of startups, including Airbnb, Dropbox, Stripe, and Reddit.\n",
-      "\n",
-      "2. Writing essays on startups and technology: Graham has written numerous influential essays on topics related to startups, programming, and entrepreneurship. His essays are widely read and have helped shape the thinking of many entrepreneurs and technologists.\n",
-      "\n",
-      "3. Developing the programming language Arc: In the early 2000s, Graham developed a new programming language called Arc, which was designed to be a more powerful and expressive dialect of Lisp.\n",
-      "\n",
-      "4. Advocating for the use of Lisp and functional programming: Graham is a strong proponent of the Lisp programming language and functional programming paradigms. He has written extensively about the benefits of these approaches and has influenced many programmers to explore them.\n",
-      "\n",
-      "5. Authoring books: Graham has authored several books, including \"Hackers & Painters: Big Ideas from the Computer Age\" (2004), \"On Lisp\" (1993), and \"ANSI Common Lisp\" (1995).\n",
-      "\n",
-      "6. Investing in startups: Through Y Combinator and his own investments, Graham has invested in and advised numerous successful startups, helping to shape the technology industry.\n",
-      "\n",
-      "Overall, Paul Graham is widely respected in the technology and startup communities for his contributions as a programmer, writer, investor, and advocate for innovative ideas and approaches."
+      "assistant: The image shows four wooden dice arranged on a dark blue or black textured surface. The dice appear to be made of light-colored wood with black dots representing the numbers. Each die shows a different face value, with various combinations of dots visible. The dice have a natural wooden finish and the classic cubic shape with rounded edges that's typical of gaming dice. This type of dice would commonly be used for board games, tabletop games, or various games of chance.\n"
      ]
     }
    ],
    "source": [
-    "for r in resp:\n",
-    "    print(r.delta, end=\"\")"
+    "from llama_index.core.llms import ChatMessage, TextBlock, ImageBlock\n",
+    "from llama_index.llms.anthropic import Anthropic\n",
+    "\n",
+    "llm = Anthropic(model=\"claude-3-7-sonnet-latest\")\n",
+    "\n",
+    "messages = [\n",
+    "    ChatMessage(\n",
+    "        role=\"user\",\n",
+    "        blocks=[\n",
+    "            ImageBlock(path=\"image.jpg\"),\n",
+    "            TextBlock(text=\"What is in this image?\"),\n",
+    "        ],\n",
+    "    )\n",
+    "]\n",
+    "\n",
+    "resp = llm.chat(messages)\n",
+    "print(resp)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "5152a2b4-78e6-47a5-933d-f5186ec0f775",
+   "id": "03aed884",
    "metadata": {},
    "source": [
-    "### Async"
+    "## Prompt Caching\n",
+    "\n",
+    "Anthropic models support the idea of prompt cahcing -- wherein if a prompt is repeated multiple times, or the start of a prompt is repeated, the LLM can reuse pre-calculated attention results to speed up the response and lower costs.\n",
+    "\n",
+    "To enable prompt caching, you can set `cache_control` on your `ChatMessage` objects, or set `cache_idx` on the LLM to always cache the first X messages (with -1 being all messages)."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7df04c0e-43ee-4176-9aad-94781d0ed36d",
+   "id": "d1027338",
    "metadata": {},
    "outputs": [],
    "source": [
+    "from llama_index.core.llms import ChatMessage\n",
     "from llama_index.llms.anthropic import Anthropic\n",
     "\n",
-    "llm = Anthropic(\"claude-3-sonnet-20240229\")\n",
-    "resp = await llm.acomplete(\"Paul Graham is \")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "be09c52b-604a-4f05-8f93-36e6ea882ff5",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Paul Graham is a computer scientist, entrepreneur, venture capitalist, and author. He is best known for the following:\n",
-      "\n",
-      "1. Co-founding Y Combinator: Y Combinator is a prominent startup accelerator based in Silicon Valley. It has funded and helped launch many successful startups, including Airbnb, Dropbox, Stripe, and Reddit.\n",
-      "\n",
-      "2. Writing essays on startups and technology: Graham has written numerous influential essays on topics related to startups, programming, and entrepreneurship. His essays are widely read and have helped shape the thinking of many entrepreneurs and technologists.\n",
-      "\n",
-      "3. Developing the programming language Arc: Graham designed and developed the programming language Arc, which was intended to be a more powerful and expressive dialect of Lisp.\n",
-      "\n",
-      "4. Authoring books: He has written several books, including \"Hackers & Painters: Big Ideas from the Computer Age,\" \"ANSI Common Lisp,\" and \"On Lisp.\"\n",
-      "\n",
-      "5. Founding Viaweb: In the 1990s, Graham co-founded Viaweb, one of the earliest web-based application software companies. Viaweb was later acquired by Yahoo! in 1998.\n",
-      "\n",
-      "Graham is widely respected in the technology and startup communities for his insights, writings, and contributions to the field of computer science and entrepreneurship.\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(resp)"
+    "llm = Anthropic(model=\"claude-3-7-latest\")\n",
+    "\n",
+    "# cache individual message(s)\n",
+    "messages = [\n",
+    "    ChatMessage(\n",
+    "        role=\"user\",\n",
+    "        content=\"<some very long prompt>\",\n",
+    "        additional_kwargs={\"cache_control\": {\"type\": \"ephemeral\"}},\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "resp = llm.chat(messages)\n",
+    "\n",
+    "# cache first X messages (with -1 being all messages)\n",
+    "llm = Anthropic(model=\"claude-3-7-latest\", cache_idx=-1)\n",
+    "\n",
+    "resp = llm.chat(messages)"
    ]
   },
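+  {
+   "cell_type": "markdown",
+   "id": "c2f8b3a0",
+   "metadata": {},
+   "source": [
+    "To verify that caching is active, you can inspect the usage block on the raw API response. Per the Anthropic Messages API, cache writes show up as `cache_creation_input_tokens` and cache hits as `cache_read_input_tokens` (a rough sketch; exact fields can vary by client version):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e41d6b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# the raw response carries the API usage object, including cache token counts\n",
+    "print(resp.raw[\"usage\"])"
+   ]
+  },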
   {
@@ -642,6 +623,336 @@
     "\n",
     "restaurant_obj"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dc778794",
+   "metadata": {},
+   "source": [
+    "## Model Thinking\n",
+    "\n",
+    "With `claude-3.7 Sonnet`, you can enable the model to \"think\" harder about a task, generating a chain-of-thought response before writing out the final answer.\n",
+    "\n",
+    "You can enable this by passing in the `thinking_dict` parameter to the constructor, specififying the amount of tokens to reserve for the thinking process."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bbf4c90f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.anthropic import Anthropic\n",
+    "from llama_index.core.llms import ChatMessage\n",
+    "\n",
+    "llm = Anthropic(\n",
+    "    model=\"claude-3-7-sonnet-latest\",\n",
+    "    # max_tokens must be greater than budget_tokens\n",
+    "    max_tokens=64000,\n",
+    "    # temperature must be 1.0 for thinking to work\n",
+    "    temperature=1.0,\n",
+    "    thinking_dict={\"type\": \"enabled\", \"budget_tokens\": 1600},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94018d16",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# Evaluating (1234 * 3421) / (231 + 2341)\n",
+      "\n",
+      "I'll solve this step by step.\n",
+      "\n",
+      "## Step 1: Calculate the numerator (1234 * 3421)\n",
+      "1234 * 3421 = 4,221,514\n",
+      "\n",
+      "## Step 2: Calculate the denominator (231 + 2341)\n",
+      "231 + 2341 = 2,572\n",
+      "\n",
+      "## Step 3: Divide the numerator by the denominator\n",
+      "4,221,514 ÷ 2,572 = 1,641.335...\n",
+      "\n",
+      "Therefore:\n",
+      "(1234 * 3421) / (231 + 2341) = 1,641.335...\n",
+      "\n",
+      "The exact answer is 1,641 + 862/2,572, which can be simplified to 1,641.335...\n",
+      "# Evaluating (1234 * 3421) / (231 + 2341)\n",
+      "\n",
+      "I'll solve this step by step.\n",
+      "\n",
+      "## Step 1: Calculate the numerator (1234 * 3421)\n",
+      "1234 * 3421 = 4,221,514\n",
+      "\n",
+      "## Step 2: Calculate the denominator (231 + 2341)\n",
+      "231 + 2341 = 2,572\n",
+      "\n",
+      "## Step 3: Divide the numerator by the denominator\n",
+      "4,221,514 ÷ 2,572 = 1,641.335...\n",
+      "\n",
+      "Therefore:\n",
+      "(1234 * 3421) / (231 + 2341) = 1,641.335...\n",
+      "\n",
+      "The exact answer is 1,641 + 862/2,572, which can be simplified to 1,641.335...\n"
+     ]
+    }
+   ],
+   "source": [
+    "messages = [\n",
+    "    ChatMessage(role=\"user\", content=\"(1234 * 3421) / (231 + 2341) = ?\")\n",
+    "]\n",
+    "\n",
+    "resp_gen = llm.stream_chat(messages)\n",
+    "\n",
+    "for r in resp_gen:\n",
+    "    print(r.delta, end=\"\")\n",
+    "\n",
+    "print()\n",
+    "print(r.message.content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b573bbb6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ErUBCkYIARgCIkA2LmXlUq2Lmkrlw4yPTpMD2I688kow8bnUjgP8DaEg0jXSgnTBjx0MWOJGpxQJA6Y3RVT/fGFm/X8ZDa7JXC0jEgybB8Sb5YUDH8RsEKcaDAFQAYIlE+97QPbA8yIwUaJV4/6oPFzx6PHC8ZZn8P05tcGdcR/Vp1z4mlLmjfaikz3mHzAOvQp1wunx0sa0Kh0TIbmx80VaWeU/RgFk0yIIZmkKXtCVI27VFVu8nw==\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(r.message.additional_kwargs[\"thinking\"][\"signature\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6dc35e06",
+   "metadata": {},
+   "source": [
+    "We can also expose the exact thinking process:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac32910e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I need to calculate (1234 * 3421) / (231 + 2341)\n",
+      "\n",
+      "Let's start by calculating the numerator: 1234 * 3421\n",
+      "1234 * 3421 = (1234 * 3000) + (1234 * 400) + (1234 * 20) + (1234 * 1)\n",
+      "= 3702000 + 493600 + 24680 + 1234\n",
+      "= 4221514\n",
+      "\n",
+      "Now let's calculate the denominator: 231 + 2341\n",
+      "231 + 2341 = 2572\n",
+      "\n",
+      "Finally, let's calculate the division: 4221514 / 2572\n",
+      "\n",
+      "Actually, let me just double-check my calculation of 1234 * 3421.\n",
+      "1234 * 3421 = 1234 * 3421\n",
+      "\n",
+      "Let me do this calculation differently.\n",
+      "   1234\n",
+      "×  3421\n",
+      "------\n",
+      "   1234\n",
+      "  24680\n",
+      " 493600\n",
+      "3702000\n",
+      "------\n",
+      "4221514\n",
+      "\n",
+      "So the numerator is 4221514.\n",
+      "\n",
+      "Now let's calculate the denominator: 231 + 2341 = 2572\n",
+      "\n",
+      "Finally, let's calculate the division: 4221514 / 2572\n",
+      "\n",
+      "4221514 / 2572 = ?\n",
+      "\n",
+      "Let me try long division.\n",
+      "4221514 / 2572 = 1640.94...\n",
+      "\n",
+      "Actually, let me verify this with another approach.\n",
+      "\n",
+      "4221514 / 2572 \n",
+      "≈ 4200000 / 2600 \n",
+      "≈ 1615.38...\n",
+      "\n",
+      "That's not matching my earlier calculation. Let me try the division again.\n",
+      "\n",
+      "4221514 / 2572 \n",
+      "\n",
+      "2572 goes into 4221 about 1.64 times, which is about 1 time.\n",
+      "4221 - 2572 = 1649\n",
+      "Bring down the 5: 16495\n",
+      "2572 goes into 16495 about 6.41 times, which is about 6 times.\n",
+      "16495 - (6 * 2572) = 16495 - 15432 = 1063\n",
+      "Bring down the 1: 10631\n",
+      "2572 goes into 10631 about 4.13 times, which is about 4 times.\n",
+      "10631 - (4 * 2572) = 10631 - 10288 = 343\n",
+      "Bring down the 4: 3434\n",
+      "2572 goes into 3434 about 1.33 times, which is about 1 time.\n",
+      "3434 - 2572 = 862\n",
+      "\n",
+      "Actually, I'm going to try one more approach. I'll use polynomial long division.\n",
+      "4221514 / 2572 = (4221514/2572)\n",
+      "\n",
+      "Let me calculate this directly.\n",
+      "4221514 / 2572 = 1641.3351...\n",
+      "\n",
+      "Let me double-check this by multiplying: 1641.3351 * 2572 ≈ 4221514? Let's see. That's approximately 1641 * 2572 = 4,220,652.\n",
+      "\n",
+      "That seems close enough (1641 * 2572 is a bit less than 4221514, which makes sense since 1641 is a bit less than 1641.3351).\n",
+      "\n",
+      "So our answer is 4221514 / 2572 = 1641.3351...\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(resp.message.additional_kwargs[\"thinking\"][\"thinking\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7b0bbcb",
+   "metadata": {},
+   "source": [
+    "## Tool/Function Calling\n",
+    "\n",
+    "Anthropic supports direct tool/function calling through the API. Using LlamaIndex, we can implement some core agentic tool calling patterns."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f360774",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.tools import FunctionTool\n",
+    "from llama_index.core.llms import ChatMessage\n",
+    "from llama_index.llms.anthropic import Anthropic\n",
+    "from datetime import datetime\n",
+    "\n",
+    "llm = Anthropic(model=\"claude-3-7-sonnet-latest\")\n",
+    "\n",
+    "\n",
+    "def get_current_time() -> dict:\n",
+    "    \"\"\"Get the current time\"\"\"\n",
+    "    return {\"time\": datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")}\n",
+    "\n",
+    "\n",
+    "# uses the tool name, any type annotations, and docstring to describe the tool\n",
+    "tool = FunctionTool.from_defaults(fn=get_current_time)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "306cf697",
+   "metadata": {},
+   "source": [
+    "We can simply do a single pass to call the tool and get the result:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a31eb615",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'time': '2025-03-06 12:36:25'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "resp = llm.predict_and_call([tool], \"What is the current time?\")\n",
+    "print(resp)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "afd7718d",
+   "metadata": {},
+   "source": [
+    "We can also use lower-level APIs to implement an agentic tool-calling loop!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59cb6fda",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Calling get_current_time with {}\n",
+      "Tool output:  {'time': '2025-03-06 12:43:36'}\n",
+      "Final response:  The current time is 12:43:36 PM on March 6, 2025.\n"
+     ]
+    }
+   ],
+   "source": [
+    "chat_history = [ChatMessage(role=\"user\", content=\"What is the current time?\")]\n",
+    "tools_by_name = {t.metadata.name: t for t in [tool]}\n",
+    "\n",
+    "resp = llm.chat_with_tools([tool], chat_history=chat_history)\n",
+    "tool_calls = llm.get_tool_calls_from_response(\n",
+    "    resp, error_on_no_tool_call=False\n",
+    ")\n",
+    "\n",
+    "if not tool_calls:\n",
+    "    print(resp)\n",
+    "else:\n",
+    "    while tool_calls:\n",
+    "        # add the LLM's response to the chat history\n",
+    "        chat_history.append(resp.message)\n",
+    "\n",
+    "        for tool_call in tool_calls:\n",
+    "            tool_name = tool_call.tool_name\n",
+    "            tool_kwargs = tool_call.tool_kwargs\n",
+    "\n",
+    "            print(f\"Calling {tool_name} with {tool_kwargs}\")\n",
+    "            tool_output = tool.call(**tool_kwargs)\n",
+    "            print(\"Tool output: \", tool_output)\n",
+    "            chat_history.append(\n",
+    "                ChatMessage(\n",
+    "                    role=\"tool\",\n",
+    "                    content=str(tool_output),\n",
+    "                    # most LLMs like Anthropic, OpenAI, etc. need to know the tool call id\n",
+    "                    additional_kwargs={\"tool_call_id\": tool_call.tool_id},\n",
+    "                )\n",
+    "            )\n",
+    "\n",
+    "            resp = llm.chat_with_tools([tool], chat_history=chat_history)\n",
+    "            tool_calls = llm.get_tool_calls_from_response(\n",
+    "                resp, error_on_no_tool_call=False\n",
+    "            )\n",
+    "    print(\"Final response: \", resp.message.content)"
+   ]
   }
  ],
  "metadata": {
diff --git a/llama-index-integrations/llms/llama-index-llms-anthropic/llama_index/llms/anthropic/base.py b/llama-index-integrations/llms/llama-index-llms-anthropic/llama_index/llms/anthropic/base.py
index 278388d355..3d3722ec22 100644
--- a/llama-index-integrations/llms/llama-index-llms-anthropic/llama_index/llms/anthropic/base.py
+++ b/llama-index-integrations/llms/llama-index-llms-anthropic/llama_index/llms/anthropic/base.py
@@ -53,8 +53,11 @@ from anthropic.types import (
     ContentBlockStopEvent,
     TextBlock,
     TextDelta,
+    ThinkingBlock,
+    ThinkingDelta,
+    ToolUseBlock,
+    SignatureDelta,
 )
-from anthropic.types.tool_use_block import ToolUseBlock
 
 if TYPE_CHECKING:
     from llama_index.core.tools.types import BaseTool
@@ -119,6 +122,21 @@ class Anthropic(FunctionCallingLLM):
     additional_kwargs: Dict[str, Any] = Field(
         default_factory=dict, description="Additional kwargs for the anthropic API."
     )
+    cache_idx: Optional[int] = Field(
+        default=None,
+        description=(
+            "Set the cache_control for every message up to and including this index. "
+            "Set to -1 to cache all messages. "
+            "Set to None to disable caching."
+        ),
+    )
+    thinking_dict: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=(
+            "Configure thinking controls for the LLM. See the Anthropic API docs for more details. "
+            "For example: thinking_dict={'type': 'enabled', 'budget_tokens': 16000}"
+        ),
+    )
 
     _client: Union[
         anthropic.Anthropic, anthropic.AnthropicVertex, anthropic.AnthropicBedrock
@@ -149,6 +167,8 @@ class Anthropic(FunctionCallingLLM):
         region: Optional[str] = None,
         project_id: Optional[str] = None,
         aws_region: Optional[str] = None,
+        cache_idx: Optional[int] = None,
+        thinking_dict: Optional[Dict[str, Any]] = None,
     ) -> None:
         additional_kwargs = additional_kwargs or {}
         callback_manager = callback_manager or CallbackManager([])
@@ -167,6 +187,8 @@ class Anthropic(FunctionCallingLLM):
             completion_to_prompt=completion_to_prompt,
             pydantic_program_mode=pydantic_program_mode,
             output_parser=output_parser,
+            cache_idx=cache_idx,
+            thinking_dict=thinking_dict,
         )
 
         if region and project_id and not aws_region:
@@ -240,27 +262,38 @@ class Anthropic(FunctionCallingLLM):
         }
 
     def _get_all_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
-        return {
+        kwargs = {
             **self._model_kwargs,
             **kwargs,
         }
 
-    def _get_content_and_tool_calls(
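+        # fall back to the constructor-level thinking config unless the caller overrides it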
+        if self.thinking_dict and "thinking" not in kwargs:
+            kwargs["thinking"] = self.thinking_dict
+
+        return kwargs
+
+    def _get_content_and_tool_calls_and_thinking(
         self, response: Any
-    ) -> Tuple[str, List[ToolUseBlock]]:
+    ) -> Tuple[str, List[Dict[str, Any]], Optional[Dict[str, Any]]]:
         tool_calls = []
+        thinking = None
         content = ""
         for content_block in response.content:
             if isinstance(content_block, TextBlock):
                 content += content_block.text
+            # this assumes a single thinking block which, as of 2025-03-06, is always true
+            elif isinstance(content_block, ThinkingBlock):
+                thinking = content_block.model_dump()
             elif isinstance(content_block, ToolUseBlock):
-                tool_calls.append(content_block.dict())
+                tool_calls.append(content_block.model_dump())
 
-        return content, tool_calls
+        return content, tool_calls, thinking
 
     @llm_chat_callback()
     def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
-        anthropic_messages, system_prompt = messages_to_anthropic_messages(messages)
+        anthropic_messages, system_prompt = messages_to_anthropic_messages(
+            messages, self.cache_idx
+        )
         all_kwargs = self._get_all_kwargs(**kwargs)
 
         response = self._client.messages.create(
@@ -270,13 +303,15 @@ class Anthropic(FunctionCallingLLM):
             **all_kwargs,
         )
 
-        content, tool_calls = self._get_content_and_tool_calls(response)
+        content, tool_calls, thinking = self._get_content_and_tool_calls_and_thinking(
+            response
+        )
 
         return ChatResponse(
             message=ChatMessage(
                 role=MessageRole.ASSISTANT,
                 content=content,
-                additional_kwargs={"tool_calls": tool_calls},
+                additional_kwargs={"tool_calls": tool_calls, "thinking": thinking},
             ),
             raw=dict(response),
         )
@@ -292,7 +327,9 @@ class Anthropic(FunctionCallingLLM):
     def stream_chat(
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponseGen:
-        anthropic_messages, system_prompt = messages_to_anthropic_messages(messages)
+        anthropic_messages, system_prompt = messages_to_anthropic_messages(
+            messages, self.cache_idx
+        )
         all_kwargs = self._get_all_kwargs(**kwargs)
 
         response = self._client.messages.create(
@@ -301,6 +338,8 @@ class Anthropic(FunctionCallingLLM):
 
         def gen() -> ChatResponseGen:
             content = ""
+            content_delta = ""
+            thinking = None
             cur_tool_calls: List[ToolUseBlock] = []
             cur_tool_call: Optional[ToolUseBlock] = None
             cur_tool_json: str = ""
@@ -310,6 +349,24 @@ class Anthropic(FunctionCallingLLM):
                     if isinstance(r.delta, TextDelta):
                         content_delta = r.delta.text
                         content += content_delta
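+                    # thinking streams as ThinkingDelta/SignatureDelta chunks; accumulate them into one ThinkingBlock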
+                    elif isinstance(r.delta, SignatureDelta):
+                        if thinking is None:
+                            thinking = ThinkingBlock(
+                                signature=r.delta.signature,
+                                thinking="",
+                                type="thinking",
+                            )
+                        else:
+                            thinking.signature += r.delta.signature
+                    elif isinstance(r.delta, ThinkingDelta):
+                        if thinking is None:
+                            thinking = ThinkingBlock(
+                                signature="",
+                                thinking=r.delta.thinking,
+                                type="thinking",
+                            )
+                        else:
+                            thinking.thinking += r.delta.thinking
                     else:
                         if not isinstance(cur_tool_call, ToolUseBlock):
                             raise ValueError("Tool call not started")
@@ -325,12 +382,14 @@ class Anthropic(FunctionCallingLLM):
                         tool_calls_to_send = [*cur_tool_calls, cur_tool_call]
                     else:
                         tool_calls_to_send = cur_tool_calls
+
                     yield ChatResponse(
                         message=ChatMessage(
                             role=role,
                             content=content,
                             additional_kwargs={
-                                "tool_calls": [t.dict() for t in tool_calls_to_send]
+                                "tool_calls": [t.dict() for t in tool_calls_to_send],
+                                "thinking": thinking.model_dump() if thinking else None,
                             },
                         ),
                         delta=content_delta,
@@ -357,7 +416,9 @@ class Anthropic(FunctionCallingLLM):
     async def achat(
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponse:
-        anthropic_messages, system_prompt = messages_to_anthropic_messages(messages)
+        anthropic_messages, system_prompt = messages_to_anthropic_messages(
+            messages, self.cache_idx
+        )
         all_kwargs = self._get_all_kwargs(**kwargs)
 
         response = await self._aclient.messages.create(
@@ -367,13 +428,15 @@ class Anthropic(FunctionCallingLLM):
             **all_kwargs,
         )
 
-        content, tool_calls = self._get_content_and_tool_calls(response)
+        content, tool_calls, thinking = self._get_content_and_tool_calls_and_thinking(
+            response
+        )
 
         return ChatResponse(
             message=ChatMessage(
                 role=MessageRole.ASSISTANT,
                 content=content,
-                additional_kwargs={"tool_calls": tool_calls},
+                additional_kwargs={"tool_calls": tool_calls, "thinking": thinking},
             ),
             raw=dict(response),
         )
@@ -389,7 +452,9 @@ class Anthropic(FunctionCallingLLM):
     async def astream_chat(
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponseAsyncGen:
-        anthropic_messages, system_prompt = messages_to_anthropic_messages(messages)
+        anthropic_messages, system_prompt = messages_to_anthropic_messages(
+            messages, self.cache_idx
+        )
         all_kwargs = self._get_all_kwargs(**kwargs)
 
         response = await self._aclient.messages.create(
@@ -398,6 +463,8 @@ class Anthropic(FunctionCallingLLM):
 
         async def gen() -> ChatResponseAsyncGen:
             content = ""
+            content_delta = ""
+            thinking = None
             cur_tool_calls: List[ToolUseBlock] = []
             cur_tool_call: Optional[ToolUseBlock] = None
             cur_tool_json: str = ""
@@ -407,6 +474,24 @@ class Anthropic(FunctionCallingLLM):
                     if isinstance(r.delta, TextDelta):
                         content_delta = r.delta.text
                         content += content_delta
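+                    # thinking streams as ThinkingDelta/SignatureDelta chunks; accumulate them into one ThinkingBlock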
+                    elif isinstance(r.delta, SignatureDelta):
+                        if thinking is None:
+                            thinking = ThinkingBlock(
+                                signature=r.delta.signature,
+                                thinking="",
+                                type="thinking",
+                            )
+                        else:
+                            thinking.signature += r.delta.signature
+                    elif isinstance(r.delta, ThinkingDelta):
+                        if thinking is None:
+                            thinking = ThinkingBlock(
+                                signature="",
+                                thinking=r.delta.thinking,
+                                type="thinking",
+                            )
+                        else:
+                            thinking.thinking += r.delta.thinking
                     else:
                         if not isinstance(cur_tool_call, ToolUseBlock):
                             raise ValueError("Tool call not started")
@@ -427,7 +512,8 @@ class Anthropic(FunctionCallingLLM):
                             role=role,
                             content=content,
                             additional_kwargs={
-                                "tool_calls": [t.dict() for t in tool_calls_to_send]
+                                "tool_calls": [t.dict() for t in tool_calls_to_send],
+                                "thinking": thinking.model_dump() if thinking else None,
                             },
                         ),
                         delta=content_delta,
diff --git a/llama-index-integrations/llms/llama-index-llms-anthropic/llama_index/llms/anthropic/utils.py b/llama-index-integrations/llms/llama-index-llms-anthropic/llama_index/llms/anthropic/utils.py
index ffad643b9d..2459e4990c 100644
--- a/llama-index-integrations/llms/llama-index-llms-anthropic/llama_index/llms/anthropic/utils.py
+++ b/llama-index-integrations/llms/llama-index-llms-anthropic/llama_index/llms/anthropic/utils.py
@@ -2,7 +2,7 @@
 Utility functions for the Anthropic SDK LLM integration.
 """
 
-from typing import Any, Dict, Sequence, Tuple
+from typing import Any, Dict, Sequence, Tuple, Optional
 
 from llama_index.core.base.llms.types import (
     ChatMessage,
@@ -15,6 +15,7 @@ from llama_index.core.base.llms.types import (
 from anthropic.types import (
     MessageParam,
     TextBlockParam,
+    ThinkingBlockParam,
     ImageBlockParam,
     CacheControlEphemeralParam,
 )
@@ -127,6 +128,7 @@ def __merge_common_role_msgs(
 
 def messages_to_anthropic_messages(
     messages: Sequence[ChatMessage],
+    cache_idx: Optional[int] = None,
 ) -> Tuple[Sequence[MessageParam], str]:
     """Converts a list of generic ChatMessages to anthropic messages.
 
@@ -140,7 +142,11 @@ def messages_to_anthropic_messages(
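+        cache_idx: optional index up to (and including) which messages are marked with ephemeral cache_control; -1 caches all messages.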
     """
     anthropic_messages = []
     system_prompt = []
-    for message in messages:
+    for idx, message in enumerate(messages):
+        # inject cache_control for all messages up to and including the cache_idx
+        if cache_idx is not None and (idx <= cache_idx or cache_idx == -1):
+            message.additional_kwargs["cache_control"] = {"type": "ephemeral"}
+
         if message.role == MessageRole.SYSTEM:
             for block in message.blocks:
                 if isinstance(block, TextBlock) and block.text:
@@ -211,19 +217,23 @@ def messages_to_anthropic_messages(
                 content=content,
             )
             anthropic_messages.append(anth_message)
+
     return __merge_common_role_msgs(anthropic_messages), system_prompt
 
 
 def _text_block_to_anthropic_message(
     block: TextBlock, kwargs: dict[str, Any]
 ) -> TextBlockParam:
-    if "cache_control" in kwargs:
+    if "thinking" in kwargs and kwargs["thinking"] is not None:
+        return ThinkingBlockParam(**kwargs["thinking"])
+    elif "cache_control" in kwargs:
         return TextBlockParam(
             text=block.text,
             type="text",
             cache_control=CacheControlEphemeralParam(type="ephemeral"),
         )
-    return TextBlockParam(text=block.text, type="text")
+    else:
+        return TextBlockParam(text=block.text, type="text")
 
 
 # Function used in bedrock
diff --git a/llama-index-integrations/llms/llama-index-llms-anthropic/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-anthropic/pyproject.toml
index 666de47c4f..d0f525e92d 100644
--- a/llama-index-integrations/llms/llama-index-llms-anthropic/pyproject.toml
+++ b/llama-index-integrations/llms/llama-index-llms-anthropic/pyproject.toml
@@ -27,11 +27,11 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-anthropic"
 readme = "README.md"
-version = "0.6.9"
+version = "0.6.10"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
-anthropic = {extras = ["bedrock", "vertex"], version = ">=0.41.0"}
+anthropic = {extras = ["bedrock", "vertex"], version = ">=0.49.0"}
 llama-index-core = "^0.12.5"
 
 [tool.poetry.group.dev.dependencies]
-- 
GitLab