Skip to content

parkervg/blendsql

Repository files navigation


blendsql

SQL 🤝 LLMs

Check out our online documentation for a more comprehensive overview.


💻 Installation

pip install blendsql

⭐ Quickstart

import pandas as pd

from blendsql import BlendSQL
from blendsql.models import TransformersLLM, LiteLLM

USE_LOCAL_CONSTRAINED_MODEL = False

# Load model, either a local transformers model, or remote provider via LiteLLM
if USE_LOCAL_CONSTRAINED_MODEL:
    model = TransformersLLM(
        "meta-llama/Llama-3.2-3B-Instruct", config={"device_map": "auto"}
    )  # Local models enable BlendSQL's predicate-guided constrained decoding
else:
    model = LiteLLM("openai/gpt-4o-mini")

# Prepare our BlendSQL connection
bsql = BlendSQL(
    {
        "People": pd.DataFrame(
            {
                "Name": [
                    "George Washington",
                    "John Adams",
                    "Thomas Jefferson",
                    "James Madison",
                    "James Monroe",
                    "Alexander Hamilton",
                    "Sabrina Carpenter",
                    "Charli XCX",
                    "Elon Musk",
                    "Michelle Obama",
                    "Elvis Presley",
                ],
                "Known_For": [
                    "Established federal government, First U.S. President",
                    "XYZ Affair, Alien and Sedition Acts",
                    "Louisiana Purchase, Declaration of Independence",
                    "War of 1812, Constitution",
                    "Monroe Doctrine, Missouri Compromise",
                    "Created national bank, Federalist Papers",
                    "Nonsense, Emails I Cant Send, Mean Girls musical",
                    "Crash, How Im Feeling Now, Boom Clap",
                    "Tesla, SpaceX, Twitter/X acquisition",
                    "Lets Move campaign, Becoming memoir",
                    "14 Grammys, King of Rock n Roll",
                ],
            }
        ),
        "Eras": pd.DataFrame({"Years": ["1700-1800", "1800-1900", "1900-2000", "2000-Now"]}),
    },
    model=model,
    verbose=True,
)

smoothie = bsql.execute(
    """
    SELECT * FROM People P
    WHERE P.Name IN {{
        LLMQA('First 3 presidents of the U.S?', quantifier='{3}')
    }}
    """,
    infer_gen_constraints=True,
)

print(smoothie.df)
# ┌───────────────────┬───────────────────────────────────────────────────────┐
# │ Name              │ Known_For                                             │
# ├───────────────────┼───────────────────────────────────────────────────────┤
# │ George Washington │ Established federal government, First U.S. Preside... │
# │ John Adams        │ XYZ Affair, Alien and Sedition Acts                   │
# │ Thomas Jefferson  │ Louisiana Purchase, Declaration of Independence       │
# └───────────────────┴───────────────────────────────────────────────────────┘
print(smoothie.summary())
# ┌────────────┬──────────────────────┬─────────────────┬─────────────────────┐
# │   Time (s) │   # Generation Calls │   Prompt Tokens │   Completion Tokens │
# ├────────────┼──────────────────────┼─────────────────┼─────────────────────┤
# │    1.25158 │                    1 │             296 │                  16 │
# └────────────┴──────────────────────┴─────────────────┴─────────────────────┘


smoothie = bsql.execute(
    """
    SELECT GROUP_CONCAT(Name, ', ') AS 'Names',
    {{
        LLMMap(
            'In which time period was this person born?',
            p.Name,
            options=Eras.Years
        )
    }} AS Born
    FROM People p
    GROUP BY Born
    """,
)

print(smoothie.df)
# ┌───────────────────────────────────────────────────────┬───────────┐
# │ Names                                                 │ Born      │
# ├───────────────────────────────────────────────────────┼───────────┤
# │ George Washington, John Adams, Thomas Jefferson, J... │ 1700-1800 │
# │ Sabrina Carpenter, Charli XCX, Elon Musk, Michelle... │ 2000-Now  │
# │ Elvis Presley                                         │ 1900-2000 │
# └───────────────────────────────────────────────────────┴───────────┘
print(smoothie.summary())
# ┌────────────┬──────────────────────┬─────────────────┬─────────────────────┐
# │   Time (s) │   # Generation Calls │   Prompt Tokens │   Completion Tokens │
# ├────────────┼──────────────────────┼─────────────────┼─────────────────────┤
# │    1.03858 │                    2 │             544 │                  75 │
# └────────────┴──────────────────────┴─────────────────┴─────────────────────┘

smoothie = bsql.execute("""
    SELECT {{
        LLMQA(
            'Describe BlendSQL in 50 words.',
            context=(
                SELECT content[0:5000] AS "README"
                FROM read_text('https://raw.githubusercontent.com/parkervg/blendsql/main/README.md')
            )
        )
    }} AS answer
""")

print(smoothie.df)
# ┌─────────────────────────────────────────────────────┐
# │ answer                                              │
# ├─────────────────────────────────────────────────────┤
# │ BlendSQL is a Python library that combines SQL a... │
# └─────────────────────────────────────────────────────┘

print(smoothie.summary())

# ┌────────────┬──────────────────────┬─────────────────┬─────────────────────┐
# │   Time (s) │   # Generation Calls │   Prompt Tokens │   Completion Tokens │
# ├────────────┼──────────────────────┼─────────────────┼─────────────────────┤
# │    4.07617 │                    1 │            1921 │                  50 │
# └────────────┴──────────────────────┴─────────────────┴─────────────────────┘

📰 News

Summary

BlendSQL is a superset of SQL for problem decomposition and hybrid question-answering with LLMs.

As a result, we can Blend together...

  • 🥤 ...operations over heterogeneous data sources (e.g. tables, text, images)
  • 🥤 ...the structured & interpretable reasoning of SQL with the generalizable reasoning of LLMs

comparison

Now, the user is given the control to oversee all calls (LLM + SQL) within a unified query language.

Features

  • Supports many DBMS 💾
    • SQLite, PostgreSQL, DuckDB, Pandas (aka duckdb in a trenchcoat)
  • Supports local & remote models ✨
    • Transformers, OpenAI, Anthropic, Ollama, and 100+ more!
  • Easily extendable to multi-modal usecases 🖼
  • Write your normal queries - smart parsing optimizes what is passed to external functions 🧠
    • Traverses abstract syntax tree with sqlglot to minimize LLM function calls 🌳
  • Constrained decoding with guidance 🚀
    • When using local models, we only generate syntactically valid outputs according to query syntax + database contents
  • LLM function caching, built on diskcache 🔑

Example

For example, imagine we have the following table titled parks, containing info on national parks in the United States.

We can use BlendSQL to build a travel planning LLM chatbot to help us navigate the options below.

Name Image Location Area Recreation Visitors (2022) Description
Death Valley death_valley.jpeg California, Nevada 3,408,395.63 acres (13,793.3 km2) 1,128,862 Death Valley is the hottest, lowest, and driest place in the United States, with daytime temperatures that have exceeded 130 °F (54 °C).
Everglades everglades.jpeg Alaska 7,523,897.45 acres (30,448.1 km2) 9,457 The country's northernmost park protects an expanse of pure wilderness in Alaska's Brooks Range and has no park facilities.
New River Gorge new_river_gorge.jpeg West Virgina 7,021 acres (28.4 km2) 1,593,523 The New River Gorge is the deepest river gorge east of the Mississippi River.
Katmai katmai.jpg Alaska 3,674,529.33 acres (14,870.3 km2) 33,908 This park on the Alaska Peninsula protects the Valley of Ten Thousand Smokes, an ash flow formed by the 1912 eruption of Novarupta.

BlendSQL allows us to ask the following questions by injecting "ingredients", which are callable functions denoted by double curly brackets ({{, }}).

Which parks don't have park facilities?

SELECT "Name", "Description" FROM parks p
  WHERE {{
      LLMMap(
          'Does this location have park facilities?',
          p.Description
      )
  }} = FALSE
Name Description
Everglades The country's northernmost park protects an expanse of pure wilderness in Alaska's Brooks Range and has no park facilities.

What does the largest park in Alaska look like?

SELECT "Name",
{{ImageCaption(Image)}} as "Image Description",
{{
    LLMMap(
        question='Size in km2?',
        values=Area
    )
}} as "Size in km" FROM parks p
WHERE "Location" = 'Alaska'
ORDER BY "Size in km" DESC LIMIT 1
Name Image Description Size in km
Everglades A forest of tall trees with a sunset in the background. 30448.1

Which state is the park in that protects an ash flow?

SELECT Location, Name AS "Park Protecting Ash Flow" FROM parks
    WHERE Name = {{
      LLMQA(
        'Which park protects an ash flow?',
        context=(SELECT Name, Description FROM parks),
        options=Name
      )
  }}
Location Park Protecting Ash Flow
Alaska Katmai

How many parks are located in more than 1 state?

SELECT COUNT(*) FROM parks
    WHERE {{LLMMap('How many states?', Location)}} > 1
Count
1

Give me some info about the park in the state that Sarah Palin was governor of.

SELECT "Name", "Location", "Description" FROM parks
/* Will gather relevant context via WebSearch, and automatically constrain output 
   to one of the values in 'parks.Location'*/
WHERE Location = {{WebSearchQA('Which state was Sarah Palin governor of?')}}
Name Location Description
Everglades Alaska The country's northernmost park protects an expanse of pure wilderness in Alaska's Brooks Range and has no park facilities.
Katmai Alaska This park on the Alaska Peninsula protects the Valley of Ten Thousand Smokes, an ash flow formed by the 1912 eruption of Novarupta.

What's the difference in visitors for those parks with a superlative in their description vs. those without?

SELECT SUM(CAST(REPLACE("Recreation Visitors (2022)", ',', '') AS integer)) AS "Total Visitors",
{{LLMMap('Contains a superlative?', Description, options=('t', 'f'))}} AS "Description Contains Superlative",
GROUP_CONCAT(Name, ', ') AS "Park Names"
FROM parks
GROUP BY "Description Contains Superlative"
Total Visitors Description Contains Superlative Park Names
43365 0 Everglades, Katmai
2722385 1 Death Valley, New River Gorge

Now, we have an intermediate representation for our LLM to use that is explainable, debuggable, and very effective at hybrid question-answering tasks.

For in-depth descriptions of the above queries, check out our documentation.

Citation

@article{glenn2024blendsql,
      title={BlendSQL: A Scalable Dialect for Unifying Hybrid Question Answering in Relational Algebra},
      author={Parker Glenn and Parag Pravin Dakle and Liang Wang and Preethi Raghavan},
      year={2024},
      eprint={2402.17882},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

Few-Shot Prompting

For the LLM-based ingredients in BlendSQL, few-shot prompting can be vital. In LLMMap, LLMQA and LLMJoin, we provide an interface to pass custom few-shot examples and dynamically retrieve those top-k most relevant examples at runtime, given the current inference example.

LLMMap

from blendsql import BlendSQL
from blendsql.ingredients.builtin import LLMMap, DEFAULT_MAP_FEW_SHOT

ingredients = {
    LLMMap.from_args(
        few_shot_examples=[
            *DEFAULT_MAP_FEW_SHOT,
            {
                "question": "Is this a sport?",
                "mapping": {
                    "Soccer": True,
                    "Chair": False,
                    "Banana": False,
                    "Golf": True
                },
                # Below are optional
                "column_name": "Items",
                "table_name": "Table",
                "return_type": "boolean"
            }
        ],
        # Will fetch `k` most relevant few-shot examples using embedding-based retriever
        k=2,
        # How many inference values to pass to model at once
        batch_size=5,
    )
}

bsql = BlendSQL(db, ingredients=ingredients)

LLMQA

from blendsql import BlendSQL
from blendsql.ingredients.builtin import LLMQA, DEFAULT_QA_FEW_SHOT

ingredients = {
    LLMQA.from_args(
        few_shot_examples=[
            *DEFAULT_QA_FEW_SHOT,
            {
                "question": "Which weighs the most?",
                "context": {
                    {
                        "Animal": ["Dog", "Gorilla", "Hamster"],
                        "Weight": ["20 pounds", "350 lbs", "100 grams"]
                    }
                },
                "answer": "Gorilla",
                # Below are optional
                "options": ["Dog", "Gorilla", "Hamster"]
            }
        ],
        # Will fetch `k` most relevant few-shot examples using embedding-based retriever
        k=2,
        # Lambda to turn the pd.DataFrame to a serialized string
        context_formatter=lambda df: df.to_markdown(
            index=False
        )
    )
}

bsql = BlendSQL(db, ingredients=ingredients)

LLMJoin

from blendsql import BlendSQL
from blendsql.ingredients.builtin import LLMJoin, DEFAULT_JOIN_FEW_SHOT

ingredients = {
    LLMJoin.from_args(
        few_shot_examples=[
            *DEFAULT_JOIN_FEW_SHOT,
            {
                "join_criteria": "Join the state to its capital.",
                "left_values": ["California", "Massachusetts", "North Carolina"],
                "right_values": ["Sacramento", "Boston", "Chicago"],
                "mapping": {
                    "California": "Sacramento",
                    "Massachusetts": "Boston",
                    "North Carolina": "-"
                }
            }
        ],
        # Will fetch `k` most relevant few-shot examples using embedding-based retriever
        k=2
    )
}

bsql = BlendSQL(db, ingredients=ingredients)

Acknowledgements

Special thanks to those below for inspiring this project. Definitely recommend checking out the linked work below, and citing when applicable!

Packages

No packages published

Contributors 5

Languages