## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

## -----------------------------------------------------------------------------
#  library(localLLM)
#  
#  # Load model
#  model <- model_load("Llama-3.2-3B-Instruct-Q5_K_M.gguf", n_gpu_layers = 999)
#  
#  # Create context with batch support
#  ctx <- context_create(
#    model,
#    n_ctx = 2048,
#    n_seq_max = 10  # Allow up to 10 parallel sequences
#  )
#  
#  # Define prompts
#  prompts <- c(
#    "What is the capital of France?",
#    "What is the capital of Germany?",
#    "What is the capital of Italy?"
#  )
#  
#  # Format prompts
#  formatted_prompts <- sapply(prompts, function(p) {
#    messages <- list(
#      list(role = "system", content = "Answer concisely."),
#      list(role = "user", content = p)
#    )
#    apply_chat_template(model, messages)
#  })
#  
#  # Process in parallel
#  results <- generate_parallel(ctx, formatted_prompts, max_tokens = 50)
#  print(results)
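#  
#  # generate_parallel() returns one result per prompt; the classification
#  # example later in this vignette maps results back positionally, so
#  # labelling them by prompt is a small convenience sketch:
#  names(results) <- prompts
#  results[["What is the capital of France?"]]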

## -----------------------------------------------------------------------------
#  results <- generate_parallel(
#    ctx,
#    formatted_prompts,
#    max_tokens = 50,
#    progress = TRUE  # force progress bar even in non-interactive mode
#  )

## -----------------------------------------------------------------------------
#  library(localLLM)
#  
#  # Load sample dataset
#  data("ag_news_sample", package = "localLLM")
#  
#  # Load model
#  model <- model_load("Llama-3.2-3B-Instruct-Q5_K_M.gguf", n_gpu_layers = 999)
#  
#  # Create context (n_seq_max determines max parallel prompts)
#  ctx <- context_create(model, n_ctx = 1024, n_seq_max = 10)
#  
#  # Prepare all prompts
#  all_prompts <- character(nrow(ag_news_sample))
#  
#  for (i in seq_len(nrow(ag_news_sample))) {
#    messages <- list(
#      list(role = "system", content = "You are a helpful assistant."),
#      list(role = "user", content = paste0(
#        "Classify this news article into exactly one category: ",
#        "World, Sports, Business, or Sci/Tech. ",
#        "Respond with only the category name.\n\n",
#        "Title: ", ag_news_sample$title[i], "\n",
#        "Description: ", substr(ag_news_sample$description[i], 1, 100), "\n\n",
#        "Category:"
#      ))
#    )
#    all_prompts[i] <- apply_chat_template(model, messages)
#  }
#  
#  # Process all samples in parallel
#  results <- generate_parallel(
#    context = ctx,
#    prompts = all_prompts,
#    max_tokens = 5,
#    seed = 92092,
#    progress = TRUE,
#    clean = TRUE
#  )
#  
#  # Extract predictions
#  ag_news_sample$LLM_result <- sapply(results, function(x) {
#    trimws(gsub("\\n.*$", "", x))
#  })
#  
#  # Calculate accuracy
#  accuracy <- mean(ag_news_sample$LLM_result == ag_news_sample$class)
#  cat("Accuracy:", round(accuracy * 100, 1), "%\n")

## -----------------------------------------------------------------------------
#  # Sequential approach
#  ag_news_sample$LLM_result <- NA_character_
#  ctx <- context_create(model, n_ctx = 512)
#  
#  system.time({
#    for (i in seq_len(nrow(ag_news_sample))) {
#      formatted_prompt <- all_prompts[i]
#      output <- generate(ctx, formatted_prompt, max_tokens = 5, seed = 92092)
#      ag_news_sample$LLM_result[i] <- trimws(output)
#    }
#  })

## -----------------------------------------------------------------------------
#  # Parallel approach
#  ctx <- context_create(model, n_ctx = 1024, n_seq_max = 10)
#  
#  system.time({
#    results <- generate_parallel(
#      ctx, all_prompts,
#      max_tokens = 5,
#      seed = 92092,
#      progress = TRUE
#    )
#  })
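#  
#  # A minimal comparison sketch, assuming the two system.time() calls above
#  # were captured as seq_time and par_time (hypothetical names):
#  speedup <- seq_time[["elapsed"]] / par_time[["elapsed"]]
#  cat("Parallel speedup:", round(speedup, 1), "x\n")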

## -----------------------------------------------------------------------------
#  # quick_llama automatically uses parallel mode for vectors
#  prompts <- c(
#    "Summarize: Climate change is affecting global weather patterns...",
#    "Summarize: The stock market reached new highs today...",
#    "Summarize: Scientists discovered a new species of deep-sea fish..."
#  )
#  
#  results <- quick_llama(prompts, max_tokens = 50)
#  print(results)

## -----------------------------------------------------------------------------
#  # If n_ctx = 2048 and n_seq_max = 8
#  # Each sequence gets approximately 2048/8 = 256 tokens
#  
#  # For longer prompts, increase n_ctx proportionally
#  ctx <- context_create(
#    model,
#    n_ctx = 4096,   # Larger context
#    n_seq_max = 8   # 8 parallel sequences
#  )
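#  
#  # Back-of-the-envelope sizing (plain arithmetic; the per-prompt token
#  # budget below is an assumption, adjust it to your prompts):
#  n_seq_max     <- 8
#  prompt_tokens <- 300   # assumed length of one formatted prompt
#  max_tokens    <- 50    # tokens generated per prompt
#  n_seq_max * (prompt_tokens + max_tokens)  # ~2800 -> round up, e.g. n_ctx = 4096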

## -----------------------------------------------------------------------------
#  hw <- hardware_profile()
#  cat("Available RAM:", round(hw$ram_total / 1e9, 1), "GB\n")
#  cat("GPU:", hw$gpu$name, "\n")

## -----------------------------------------------------------------------------
#  results <- generate_parallel(ctx, prompts, max_tokens = 50)
#  
#  # Check for errors
#  for (i in seq_along(results)) {
#    if (grepl("^Error:", results[i])) {
#      cat("Prompt", i, "failed:", results[i], "\n")
#    }
#  }
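#  
#  # Optional retry sketch: re-run failed prompts one at a time with generate()
#  # (the retry loop is an illustration, not a built-in feature)
#  failed <- grep("^Error:", results)
#  for (i in failed) {
#    results[i] <- generate(ctx, prompts[i], max_tokens = 50)
#  }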

## -----------------------------------------------------------------------------
#  library(localLLM)
#  
#  # 1. Setup
#  model <- model_load("Llama-3.2-3B-Instruct-Q5_K_M.gguf", n_gpu_layers = 999)
#  ctx <- context_create(model, n_ctx = 2048, n_seq_max = 10)
#  
#  # 2. Prepare prompts
#  data("ag_news_sample", package = "localLLM")
#  
#  prompts <- sapply(seq_len(nrow(ag_news_sample)), function(i) {
#    messages <- list(
#      list(role = "system", content = "Classify news articles."),
#      list(role = "user", content = paste0(
#        "Category (World/Sports/Business/Sci/Tech): ",
#        ag_news_sample$title[i]
#      ))
#    )
#    apply_chat_template(model, messages)
#  })
#  
#  # 3. Process in batches with progress
#  results <- generate_parallel(
#    ctx, prompts,
#    max_tokens = 10,
#    seed = 42,
#    progress = TRUE,
#    clean = TRUE
#  )
#  
#  # 4. Extract and evaluate
#  predictions <- sapply(results, function(x) trimws(gsub("\\n.*", "", x)))
#  accuracy <- mean(predictions == ag_news_sample$class)
#  cat("Accuracy:", round(accuracy * 100, 1), "%\n")

## -----------------------------------------------------------------------------
#  # Fully silent batch pipeline
#  model   <- model_load("model.gguf",          verbosity = 0)
#  ctx     <- context_create(model, n_seq_max = 8, verbosity = 0)
#  results <- generate_parallel(ctx, prompts, max_tokens = 50, progress = FALSE)

