bluebull - Hackathon: Benchmarking Small Language Models in the Real World
AI Tinkerers - Paris
Hackathon Showcase Best Startup Winner

bluebull

A one-person team: a Ublo software engineer and 42 Paris alumna who specializes in React, TypeScript, and Python-based agentic AI workflows.

1 member

Bluebull is a text-to-Polars code generation system built on Qwen2.5-Coder-7B-Instruct, a code-specialized 7B model quantized to 4-bit (MLX on Apple Silicon, bitsandbytes NF4 on CUDA), which reduces VRAM usage to roughly 4 GB.

Correctness (N):
A structured system prompt encodes 8 Polars-specific rules (e.g., .height not .count(), .dt.month() with parentheses, no ascending=True) alongside 5 hand-crafted few-shot examples targeting common LLM mistakes. A self-repair loop re-prompts the model with the error message on execution failure, automatically recovering many edge cases.

Latency (T):
Generation is capped at 200 tokens. The model loads in a background thread so the server responds immediately, and 4-bit quantization reduces both load time and memory bandwidth. The system scored 16/16 correct on the local eval set in ~39 seconds (N/T ≈ 0.41).

Stack:
FastAPI + Uvicorn, Polars 1.40, HuggingFace Transformers, MLX-LM, bitsandbytes, Python 3.12. Safe code execution with a 10-second timeout and restricted builtins.

from src.model import CodeGenerator
from src.evaluator import evaluate

if __name__ == "__main__":
    # Script entry point: construct the generator once, then hand it to
    # evaluate() together with the eval-set and parquet paths.
    gen = CodeGenerator()
    evaluate(
        eval_set_path="data/eval_set.json",
        parquet_path="data/sales.parquet",
        generator=gen,
        # NOTE(review): presumably the number of self-repair re-prompts
        # allowed per question -- confirm against src.evaluator.
        max_retries=1,
    )

import platform
import sys

def _is_apple_silicon() -> bool:
    """Return True when running on macOS ("darwin") with an arm64 CPU."""
    return sys.platform == "darwin" and platform.machine() == "arm64"

class CodeGenerator:
def init(self, model_name: str | None = None):
if _is_apple_silicon():
self._init_mlx(model_name or “mlx-community/Qwen2.5-Coder-7B-Instruct-4bit”)
else:
self._init_transformers(model_name or “Qwen/Qwen2.5-Coder-7B-Instruct”)

def _init_mlx(self, model_name: str) -> None:  
    from mlx_lm import load  
    self._backend = "mlx"  
    print(f"Loading {model_name} (MLX)...")  
    self.model, self.tokenizer = load(model_name)  
    print("Model loaded.")  
  
def _init_transformers(self, model_name: str) -> None:  
    import torch  
    from transformers import AutoModelForCausalLM, AutoTokenizer  
    self._backend = "transformers"  
    print(f"Loading {model_name} (transformers)...")  
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)  
    load_kwargs: dict = {"device_map": "auto"}  
    try:  
        from transformers import BitsAndBytesConfig  
        load_kwargs["quantization_config"] = BitsAndBytesConfig(  
            load_in_4bit=True,  
            bnb_4bit_compute_dtype=torch.bfloat16,  
            bnb_4bit_quant_type="nf4",  
        )  
        print("Using 4-bit quantization (bitsandbytes).")  
    except (ImportError, Exception):  
        load_kwargs["torch_dtype"] = torch.float16  
        print("bitsandbytes unavailable, loading in float16.")  
    self.model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)  
    print("Model loaded.")  
  
def generate(self, prompt: str, max_tokens: int = 200) -> str:  
    if self._backend == "mlx":  
        return self._generate_mlx(prompt, max_tokens)  
    return self._generate_transformers(prompt, max_tokens)  
  
def _generate_mlx(self, prompt: str, max_tokens: int) -> str:  
    from mlx_lm import generate as mlx_generate  
    messages = [{"role": "user", "content": prompt}]  
    formatted = self.tokenizer.apply_chat_template(  
        messages, tokenize=False, add_generation_prompt=True  
    )  
    output = mlx_generate(  
        self.model, self.tokenizer, prompt=formatted, max_tokens=max_tokens, verbose=False  
    )  
    return (  
        output.strip()  
        .replace("<|im_end|>", "")  
        .replace("<|endoftext|>", "")  
        .strip()  
    )  
  
def _generate_transformers(self, prompt: str, max_tokens: int) -> str:  
    import torch  
    messages = [{"role": "user", "content": prompt}]  
    text = self.tokenizer.apply_chat_template(  
        messages, tokenize=False, add_generation_prompt=True  
    )  
    inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)  
    with torch.no_grad():  
        out = self.model.generate(  
            **inputs,  
            max_new_tokens=max_tokens,  
            do_sample=False,  
            temperature=None,  
            top_p=None,  
            pad_token_id=self.tokenizer.eos_token_id,  
        )  
    new_tokens = out[0][inputs["input_ids"].shape[1]:]  
    return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()  

from src.model import CodeGenerator
from src.evaluator import evaluate

if __name__ == "__main__":
    # Script entry point: construct the generator once, then hand it to
    # evaluate() together with the eval-set and parquet paths.
    gen = CodeGenerator()
    evaluate(
        eval_set_path="data/eval_set.json",
        parquet_path="data/sales.parquet",
        generator=gen,
        # NOTE(review): presumably the number of self-repair re-prompts
        # allowed per question -- confirm against src.evaluator.
        max_retries=1,
    )

# Instruction text placed at the start of every prompt built by build_prompt().
# Code fragments inside it use plain ASCII quotes and "..." so that anything
# the model copies from the rules is itself valid Python.
SYSTEM_INSTRUCTION = """Generate ONLY valid Python Polars code.
df is loaded, pl is imported. No markdown, no explanation, no print.
Assign the final result to a variable named result.

Rules:

  • Scalars: use .item() after .select() or .sort().head(1)[col].item()
  • Row count: .height
  • Date methods need parens: .dt.month(), .dt.year()
  • Membership: .is_in([...]) not .isin()
  • Sort DESCENDING (highest first): .sort('col', descending=True)
  • NEVER use ascending=True — that argument does not exist in Polars
  • String equality in filter: pl.col('x') == 'value'
"""

# Worked examples rendered into the prompt by build_prompt(). Each entry has a
# "schema" line, a natural-language "question", and the Polars "code" that
# answers it; every "code" value must itself be valid Python.
FEW_SHOTS = [
    {
        # Filter + scalar aggregate via .select(...).item().
        "schema": "df: product (str), revenue (f64), country (str)",
        "question": "What is the total revenue in France?",
        "code": "result = df.filter(pl.col('country') == 'France').select(pl.col('revenue').sum()).item()",
    },
    {
        # Arg-max over groups: sort descending, take the first row.
        "schema": "df: product (str), revenue (f64)",
        "question": "Which product has the highest total revenue?",
        "code": "result = df.group_by('product').agg(pl.col('revenue').sum()).sort('revenue', descending=True).head(1)['product'].item()",
    },
    {
        # Arg-min over groups: default (ascending) sort, take the first row.
        "schema": "df: country (str), quantity (i64)",
        "question": "Which country has the lowest average quantity?",
        "code": "result = df.group_by('country').agg(pl.col('quantity').mean()).sort('quantity').head(1)['country'].item()",
    },
    {
        # Grouping on a derived date part, with .dt.month() called as a method.
        "schema": "df: date (date), revenue (f64)",
        "question": "Which month number has the highest total revenue?",
        "code": "result = df.group_by(pl.col('date').dt.month().alias('month')).agg(pl.col('revenue').sum()).sort('revenue', descending=True).head(1)['month'].item()",
    },
    {
        # Row counting with .height and membership with .is_in([...]).
        "schema": "df: date (date), amount (f64)",
        "question": "How many rows are in months 1 through 3?",
        "code": "result = df.filter(pl.col('date').dt.month().is_in([1, 2, 3])).height",
    },
]

def build_prompt(schema: str, question: str) -> str:
    """Assemble the full generation prompt for one question.

    The prompt is SYSTEM_INSTRUCTION, then every FEW_SHOTS entry rendered as
    a ``Schema:/Q:/Code:`` triple, then the caller's schema and question with
    a trailing ``Code:`` left open for the model to complete.

    Args:
        schema: Description of the dataframe columns available as ``df``.
        question: Natural-language question to answer.

    Returns:
        The prompt text, ending in ``"Code:"``.
    """
    shots = "\n\n".join(
        f"Schema: {s['schema']}\nQ: {s['question']}\nCode: {s['code']}"
        for s in FEW_SHOTS
    )
    return (
        f"{SYSTEM_INSTRUCTION}\n\n"
        f"Examples:\n{shots}\n\n"
        f"Schema: {schema}\nQ: {question}\nCode:"
    )

Mistral AI claude code / cursor