Minds and Machines — AI for Mental Health Support, Fine-Tuning LLMs with LoRA in Practice
by Volker Janz

Hackathon Guide: Building Your First Mental Health Chatbot


AI therapy session, generated with DALL·E 3

Vertex AI Workbench instance for fine-tuning, source: by author

Google Colab, source: https://colab.research.google.com/

Google Colab pricing, source: https://colab.research.google.com/

Parameter-Efficient Fine-Tuning (PEFT) of Llama 2 with Mental Health Counseling Data

pip install torch torchvision datasets transformers tokenizers bitsandbytes peft accelerate trl
pip install flash-attn
import gc
import torch

from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer

# see: https://huggingface.co/docs/hub/security-tokens
# must be write token to push model later
hf_token = "your-token"

# https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
base_model = "meta-llama/Llama-2-7b-chat-hf"

# https://huggingface.co/datasets/Amod/mental_health_counseling_conversations
fine_tuning_dataset = "Amod/mental_health_counseling_conversations"

# name for output model
target_model = "vojay/Llama-2-7b-chat-hf-mental-health"
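Llama 2 is a gated model, so the session has to be authenticated with the Hugging Face token before the base weights can be downloaded and before the fine-tuned adapter can be pushed later. In a notebook, the notebook_login() helper imported above works; a minimal alternative for scripts, assuming the token defined above:

from huggingface_hub import login

# authenticate once per session; notebook_login() opens an interactive widget instead
login(token=hf_token)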

Llama 2 chat models expect prompts in the following format:

<s>[INST] <<SYS>>
{{ system_prompt }}
<</SYS>>
{{ user_message }} [/INST] {{ model_response }} </s>
def get_base_prompt():
    return """
    You are a knowledgeable and supportive psychologist. You provide empathetic, non-judgmental responses to users
    seeking emotional and psychological support. Provide a safe space for users to share and reflect, focus on empathy,
    active listening and understanding.
    """

def format_prompt(base, context, response):
    return f"<s>[INST] <<SYS>>{base}<</SYS>>{context} [/INST] {response} </s>"
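Before training, it helps to see what a fully formatted sample looks like. A small, optional sanity check that formats the first record of the counseling dataset (Context and Response are the columns the trainer uses later):

# optional sanity check: print one formatted training sample
sample = load_dataset(fine_tuning_dataset, split="train[:1]")[0]
print(format_prompt(get_base_prompt(), sample["Context"], sample["Response"]))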
def train_mental_health_model():
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        token=hf_token,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False
        ),
        torch_dtype=torch.float16,  # reduce memory usage
        attn_implementation="flash_attention_2"  # optimize for tensor cores (NVIDIA A100)
    )

    # LoRA config based on QLoRA paper
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=8,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
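    # optional: report how few of the model's parameters LoRA actually trains (peft helper)
    model.print_trainable_parameters()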

    args = TrainingArguments(
        output_dir=target_model,  # model output directory
        overwrite_output_dir=True,  # overwrite output if it exists
        num_train_epochs=2,  # number of epochs to train
        per_device_train_batch_size=2,  # batch size per device during training
        gradient_checkpointing=True,  # save memory at the cost of slower training
        logging_steps=10,  # log every 10 steps
        learning_rate=1e-4,  # learning rate
        max_grad_norm=0.3,  # max gradient norm based on QLoRA paper
        warmup_ratio=0.03,  # warmup ratio based on QLoRA paper
        optim="paged_adamw_8bit",  # memory-efficient variant of AdamW optimizer
        lr_scheduler_type="constant",  # constant learning rate
        save_strategy="epoch",  # save at the end of each epoch
        evaluation_strategy="epoch",  # evaluate at the end of each epoch
        fp16=True,  # 16-bit precision training instead of 32-bit to save memory
        tf32=True  # optimize for tensor cores (NVIDIA A100)
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # limit samples to reduce memory usage
    dataset = load_dataset(fine_tuning_dataset, split="train")
    train_dataset = dataset.select(range(2000))
    eval_dataset = dataset.select(range(2000, 2500))

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=1024,
        tokenizer=tokenizer,
        formatting_func=lambda entry: format_prompt(get_base_prompt(), entry["Context"], entry["Response"]),
        packing=True,
        args=args
    )

    gc.collect()
    torch.cuda.empty_cache()

    trainer.train()
    trainer.save_model()
    trainer.push_to_hub(target_model, token=hf_token)


Notebook in Vertex AI Workbench instance, source: by author
train_mental_health_model()

Fine-tuning process, source: by author

Fine-tuned model on Hugging Face, source: by author
model_id = "meta-llama/Llama-2-7b-chat-hf"
adapter_model_id = "vojay/Llama-2-7b-chat-hf-mental-health"

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
model.load_adapter(adapter_model_id)
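
Alternatively, the LoRA weights can be merged into the base model, so the result can be shipped and loaded like any regular model without a separate adapter. A minimal sketch using peft's PeftModel, with a hypothetical output directory:

from peft import PeftModel

# attach the LoRA adapter to the plain base model and merge the weights in place
base = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
merged = PeftModel.from_pretrained(base, adapter_model_id).merge_and_unload()
merged.save_pretrained("Llama-2-7b-chat-hf-mental-health-merged")  # hypothetical output path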

Create a Chatbot with the Fine-tuned Model

poetry config virtualenvs.in-project true
poetry new mental-health-bot
cd mental-health-bot

poetry add huggingface_hub
poetry add adapters
poetry add transformers
poetry add peft
poetry add torch

device = torch.device("mps")  # use the Apple Silicon GPU via Metal Performance Shaders
torch.set_default_device(device)
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("mps")
torch.set_default_device(device)

login(token="your-token")

title = "Mental Health Chatbot"
description = "This bot is using a fine-tuned version of meta-llama/Llama-2-7b-chat-hf"

model_id = "meta-llama/Llama-2-7b-chat-hf"
adapter_model_id = "vojay/Llama-2-7b-chat-hf-mental-health"

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
model.load_adapter(adapter_model_id)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def get_base_prompt():
    return """
    You are a knowledgeable and supportive psychologist. You provide empathetic, non-judgmental responses to users
    seeking emotional and psychological support. Provide a safe space for users to share and reflect, focus on empathy,
    active listening and understanding.
    """

def format_prompt(base, user_message):
    return f"<s>[INST] <<SYS>>{base}<</SYS>>{user_message} [/INST]"

def chat_with_llama(prompt):
    input_ids = tokenizer.encode(format_prompt(get_base_prompt(), prompt), return_tensors="pt")
    input_ids = input_ids.to(device)

    output = model.generate(
        input_ids,
        pad_token_id=tokenizer.eos_token_id,
        max_length=2000,
        do_sample=True,  # enable sampling so that temperature, top_k and top_p take effect
        temperature=0.9,
        top_k=50,
        top_p=0.9
    )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded.split("[/INST]")[1].lstrip()

while True:
    prompt = input("You: ")
    response = chat_with_llama(prompt)
    print(f"Llama: {response}")
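
To make the chat feel more responsive, the response can also be streamed token by token instead of being printed all at once. A minimal sketch of the generate call inside chat_with_llama using transformers' TextStreamer:

from transformers import TextStreamer

# stream tokens to stdout as they are generated; skip the echoed prompt
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

output = model.generate(
    input_ids,
    pad_token_id=tokenizer.eos_token_id,
    max_length=2000,
    do_sample=True,
    temperature=0.9,
    top_k=50,
    top_p=0.9,
    streamer=streamer
)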

I'm going through some things with my feelings and myself. I barely sleep and I've been struggling with anxiety, and stress. Can you recommend any coping strategies to avoid medication?
