# Uploaded by FatimahEmadEldin -- commit 4757448 (verified): "Add training script for reproducibility"
# -*- coding: utf-8 -*-
"""Open-Track-Document-Bassline-Readability-Arabertv2-d3tok-reg.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1bwUyQ1WscI7jXo6arMRwn4NIswjaqK7Y
"""
import pandas as pd
import numpy as np
import os
import torch
import zipfile
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
EarlyStoppingCallback
)
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar
# --- Configuration ---
# Pretrained checkpoint that is fine-tuned by this script.
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
# Size of the model's output layer (one value per input).
NUM_LABELS = 1
# Number of readability levels; predictions are clipped to [0, TARGET_CLASSES - 1].
TARGET_CLASSES = 19

# Every directory below is resolved relative to BASE_DIR.
BASE_DIR = '.'
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")
# Checkpoints are grouped under the model's short name (the part after the last '/').
CHECKPOINT_DIR = os.path.join(BASE_DIR, "results", "regression_" + MODEL_NAME.rsplit('/', 1)[-1])

# Output directories are created eagerly so later writes cannot fail on a missing folder.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
# Inputs.
BAREC_TRAIN_PATH = os.path.join(DATA_DIR, 'train_augmented.csv')
BAREC_DEV_PATH = os.path.join(DATA_DIR, 'dev_augmented.csv')
BLIND_TEST_PATH = os.path.join(DATA_DIR, 'blind_test_data.csv')
# NOTE: the "preprocessed" cache paths resolve to exactly the same files as the
# BAREC paths above, so an existing train/dev CSV is always treated as preprocessed.
TRAIN_PREPROCESSED_PATH = BAREC_TRAIN_PATH
DEV_PREPROCESSED_PATH = BAREC_DEV_PATH
# Outputs.
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_regression_big_data.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_regression_final_op4n.zip")
# --- DATA LOADING AND PREPROCESSING ---
def preprocess_d3tok(text, disambiguator):
    """
    Convert raw text into the space-separated D3Tok form.

    Each word is replaced by the dediacritized 'd3tok' value of its first
    analysis, with "_+" / "+_" clitic joints rewritten as " +" / "+ ". Words
    without a usable 'd3tok' value are kept as their surface form.

    Args:
        text: Sentence to preprocess. Anything that is not a non-blank string
            produces an empty result.
        disambiguator: Object exposing ``disambiguate(tokens)``; this script
            passes a ``BERTUnfactoredDisambiguator``.

    Returns:
        The per-word forms joined by single spaces, or ``""`` for missing or
        blank input.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    tokens = simple_word_tokenize(text)
    d3tok_forms = []
    for disambig_word in disambiguator.disambiguate(tokens):
        analyses = disambig_word.analyses
        # Only the first analysis is consulted; element 1 of that entry is its
        # feature dictionary.
        d3tok = analyses[0][1].get('d3tok') if analyses else None
        if d3tok is None:
            # No analysis at all, or an analysis without a 'd3tok' value
            # (e.g. punctuation): fall back to the surface word.
            d3tok_forms.append(disambig_word.word)
            continue
        d3tok_forms.append(dediac_ar(d3tok).replace("_+", " +").replace("+_", "+ "))
    return " ".join(d3tok_forms)
def _normalize_raw_frame(raw_df):
    """Reduce a raw BAREC frame to a string 'text' column and a zero-based float 'label' column."""
    frame = raw_df[['Sentence', 'Readability_Level_19']].rename(
        columns={'Sentence': 'text', 'Readability_Level_19': 'label'})
    frame = frame.dropna(subset=['text', 'label'])
    frame['text'] = frame['text'].astype(str)
    # Levels are shifted down by one and stored as floats.
    frame['label'] = (frame['label'].astype(int) - 1).astype(float)
    return frame


def load_or_preprocess_data(disambiguator):
    """
    Load the train/dev frames, preprocessing the raw CSVs only when needed.

    If both preprocessed files exist they are read as-is. Otherwise the raw
    BAREC files are read, normalised to 'text'/'label' columns, converted to
    D3Tok and written to the preprocessed paths for later runs.

    Args:
        disambiguator: Passed to ``preprocess_d3tok``; only used when the raw
            files have to be preprocessed.

    Returns:
        ``(train_df, val_df)`` on success, or ``(None, None)`` if the raw data
        could not be read or processed (the error is printed, not raised).
    """
    print("--- Loading BAREC Data ---")
    if os.path.exists(TRAIN_PREPROCESSED_PATH) and os.path.exists(DEV_PREPROCESSED_PATH):
        print("✔ Found preprocessed files. Loading them directly...")
        train_df = pd.read_csv(TRAIN_PREPROCESSED_PATH)
        val_df = pd.read_csv(DEV_PREPROCESSED_PATH)
        # read_csv turns empty cells into NaN; without fillna, astype(str)
        # would feed the literal text "nan" to the tokenizer.
        train_df['text'] = train_df['text'].fillna('').astype(str)
        val_df['text'] = val_df['text'].fillna('').astype(str)
        print(f"Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df

    print("Preprocessed files not found. Starting one-time preprocessing...")
    try:
        raw_train_df = pd.read_csv(BAREC_TRAIN_PATH)
        raw_val_df = pd.read_csv(BAREC_DEV_PATH)
        train_df = _normalize_raw_frame(raw_train_df)
        val_df = _normalize_raw_frame(raw_val_df)
        print(f"Successfully loaded raw data: {len(train_df)} training and {len(val_df)} validation records.")

        print("\n--- Preprocessing Text to D3Tok format (this will only run once) ---")
        train_df['text'] = train_df['text'].apply(lambda x: preprocess_d3tok(x, disambiguator))
        val_df['text'] = val_df['text'].apply(lambda x: preprocess_d3tok(x, disambiguator))
        print("✔ Text preprocessing finished.")

        print("\n--- Saving preprocessed data for future use... ---")
        train_df.to_csv(TRAIN_PREPROCESSED_PATH, index=False)
        val_df.to_csv(DEV_PREPROCESSED_PATH, index=False)
        print(f"** Saved preprocessed files to {TRAIN_PREPROCESSED_PATH} and {DEV_PREPROCESSED_PATH} **")
        return train_df, val_df
    except FileNotFoundError:
        # Name the files this script actually reads rather than generic placeholders.
        print(
            f"! ERROR: Raw file not found. Make sure '{os.path.basename(BAREC_TRAIN_PATH)}' "
            f"and '{os.path.basename(BAREC_DEV_PATH)}' are in the '{DATA_DIR}' directory."
        )
        return None, None
    except Exception as e:
        # Deliberate catch-all: the caller treats (None, None) as "stop the script".
        print(f"! ERROR during initial processing: {e}")
        return None, None
# The disambiguator is kept at module scope because the blind-test
# preprocessing further down reuses it.
print("Initializing BERT Disambiguator for preprocessing...")
bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')
train_df, val_df = load_or_preprocess_data(bert_disambiguator)
if train_df is None:
    print("Stopping script due to data loading failure.")
    # SystemExit does not depend on the site-provided exit() helper and
    # reports the failure through a non-zero exit status.
    raise SystemExit(1)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# --- DATASET AND METRICS ---
class ReadabilityDataset(TorchDataset):
    """
    Torch dataset of tokenized texts with optional float targets.

    All texts are tokenized once, in the constructor, with the module-level
    ``tokenizer``; every sequence is truncated/padded to ``max_length``.
    """

    def __init__(self, texts, labels=None, max_length=256):
        """
        Args:
            texts: List of input strings.
            labels: Optional sequence of numeric targets aligned with
                ``texts``; omit it for prediction-only data.
            max_length: Token length every sequence is truncated/padded to.

        Raises:
            ValueError: If ``labels`` is given but its length differs from
                that of ``texts``.
        """
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx``; 'labels' is included only when targets exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Return the number of encoded samples (0 if there is no 'input_ids' entry)."""
        return len(self.encodings.get('input_ids', []))
def compute_metrics(p):
    """
    Compute quadratic weighted kappa (QWK) between predicted and true levels.

    Predictions are rounded to the nearest integer and clipped to
    ``[0, TARGET_CLASSES - 1]`` before scoring; labels are cast to int.

    Args:
        p: Object exposing ``predictions`` and ``label_ids`` arrays.

    Returns:
        ``{"qwk": <float>}``.
    """
    preds = p.predictions.flatten()
    pred_levels = np.clip(np.round(preds), 0, TARGET_CLASSES - 1).astype(int)
    true_levels = p.label_ids.astype(int)
    # Pin the label set to every level. Otherwise scikit-learn indexes only the
    # levels that actually occur, and the quadratic weights (computed from
    # index positions) would understate the distance across any absent level.
    qwk = cohen_kappa_score(
        true_levels,
        pred_levels,
        labels=np.arange(TARGET_CLASSES),
        weights='quadratic',
    )
    return {"qwk": qwk}
# --- MODEL TRAINING ---
print("\n===== INITIALIZING REGRESSION MODEL AND TRAINER =====\n")

# Pretrained checkpoint with an output layer of NUM_LABELS unit(s).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Both splits are tokenized here, when the datasets are constructed.
train_dataset = ReadabilityDataset(
    train_df['text'].tolist(),
    train_df['label'].tolist(),
)
val_dataset = ReadabilityDataset(
    val_df['text'].tolist(),
    val_df['label'].tolist(),
)

training_args = TrainingArguments(
    # Where checkpoints go, and how many are kept on disk.
    output_dir=CHECKPOINT_DIR,
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch; the checkpoint with the highest
    # "qwk" is restored when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    # Logging / hardware.
    logging_steps=100,
    report_to="none",
    fp16=torch.cuda.is_available(),  # half precision only when CUDA is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

print("Starting training...")
trainer.train()
print("✔ Training finished.")
# =====================================================================================
# 5. MODEL TRAINING
# =====================================================================================
print("\n===== INITIALIZING REGRESSION MODEL AND TRAINER =====\n")

# A fresh model/trainer pair is built here; any weights carried over come from
# the checkpoint that is resumed below, not from objects created earlier.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
train_dataset = ReadabilityDataset(train_df['text'].tolist(), train_df['label'].tolist())
val_dataset = ReadabilityDataset(val_df['text'].tolist(), val_df['label'].tolist())

training_args = TrainingArguments(
    output_dir=CHECKPOINT_DIR,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


def _find_latest_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered 'checkpoint-<step>' subdirectory, or None if there is none."""
    candidates = []
    for entry in os.listdir(checkpoint_dir):
        prefix, _, step = entry.rpartition('-')
        # Accept only 'checkpoint-<decimal>' directories so that stray files
        # or differently named folders cannot break the int() conversion.
        if prefix != "checkpoint" or not step.isdecimal():
            continue
        entry_path = os.path.join(checkpoint_dir, entry)
        if os.path.isdir(entry_path):
            candidates.append((int(step), entry_path))
    if not candidates:
        return None
    return max(candidates, key=lambda candidate: candidate[0])[1]


# --- RESUMING ---
# Resume from the most recent checkpoint in the output directory, whatever its
# step number is, instead of relying on one specific step count.
if os.path.isdir(CHECKPOINT_DIR):
    latest_checkpoint_path = _find_latest_checkpoint(CHECKPOINT_DIR)
    if latest_checkpoint_path is not None:
        print(f"Resuming training from checkpoint: {latest_checkpoint_path}")
        trainer.train(resume_from_checkpoint=latest_checkpoint_path)
    else:
        # Directory exists but holds no checkpoints: start from scratch.
        print("No checkpoint found. Starting training from the beginning...")
        trainer.train()
else:
    # Output directory doesn't even exist: start fresh.
    print("No checkpoint directory found. Starting training from the beginning...")
    trainer.train()
print("✔ Training finished.")
# --- FINAL PREDICTION AND SUBMISSION ---
# Predict readability levels for the blind test set, then write the submission
# CSV and a zip archive containing it. Failures are reported and the script
# still runs to its final message.
print("\n===== FINAL PREDICTION AND SUBMISSION =====\n")

test_df = None
try:
    test_df = pd.read_csv(BLIND_TEST_PATH)
except FileNotFoundError:
    # Scoped to the read itself so that only a missing test CSV produces this message.
    print(f"! ERROR: Test file not found. Make sure 'blind_test_data.csv' is in the '{DATA_DIR}' directory.")
except Exception as e:
    print(f"An error occurred during final prediction: {e}")

if test_df is not None:
    # Check the schema explicitly so that the report names the column(s) that
    # are really missing, and unrelated KeyErrors are not blamed on the CSV.
    missing_columns = [column for column in ('ID', 'Sentence') if column not in test_df.columns]
    if missing_columns:
        print(
            f"! KEY ERROR: Could not find the column(s) {missing_columns} in the test data. "
            "Please check the blind_test_data.csv file."
        )
    else:
        try:
            test_df.dropna(subset=['Sentence'], inplace=True)

            print("Preprocessing blind test text to D3Tok format...")
            # Reuses the module-level disambiguator created before training.
            test_df['processed_text'] = test_df['Sentence'].apply(
                lambda x: preprocess_d3tok(x, bert_disambiguator))

            print("Generating predictions on the test set...")
            test_dataset = ReadabilityDataset(test_df['processed_text'].tolist())
            predictions = trainer.predict(test_dataset)

            # Round to the nearest level, clip to the valid range, then shift
            # back by one to undo the zero-based labels used for training.
            raw_preds = predictions.predictions.flatten()
            rounded_preds = np.round(raw_preds)
            clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1)
            test_df['Prediction'] = (clipped_preds + 1).astype(int)

            # The test CSV calls the identifier 'ID'; the submission file uses 'Sentence ID'.
            submission_df = test_df[['ID', 'Prediction']].rename(columns={'ID': 'Sentence ID'})

            print(f"Saving prediction file to: {SUBMISSION_PATH}")
            submission_df.to_csv(SUBMISSION_PATH, index=False)

            print(f"\nCompressing {os.path.basename(SUBMISSION_PATH)} into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
            with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
                zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
            print(f"✔ Submission file {os.path.basename(ZIPPED_SUBMISSION_PATH)} created successfully.")
        except Exception as e:
            # Deliberate catch-all at the end of the script: report and finish.
            print(f"An error occurred during final prediction: {e}")

print("\n--- Script Finished ---")