A Systematic Study of the Role of Data Quality and Alignment for Fine-tuning LLMs for Enhanced Autoformalization
Please cite us if you find our work useful:
@article{chawla-data-alignment-autoformalization,
  author = {Krrish Chawla and Mario Depavia and Aryan Sahai and Brando Miranda},
  title = {A Systematic Study of the Role of Data Quality and Alignment for Fine-tuning LLMs for Enhanced Autoformalization},
  abstract = {This study explores the role of data quality, particularly alignment, in fine-tuning Large Language Models (LLMs) for the task of autoformalization. Contrary to the conventional emphasis on dataset size, our research highlights the importance of data alignment, i.e., the similarity between the training data and the target domain. Through our experiments, we demonstrate a negative correlation between data alignment and model perplexity loss. These findings suggest a re-evaluation of LLM training approaches, emphasizing quality and relevance over quantity, especially in specialized applications such as autoformalization.},
}
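
The abstract's central measurement, perplexity of a model on a target autoformalization domain, can be reproduced in a few lines. Below is a minimal sketch (not the paper's code) of how one might compare fine-tuned checkpoints by their perplexity on target-domain text, assuming a Hugging Face causal LM; the `perplexity` helper, the `gpt2` model name, and the sample Lean-style statement are all illustrative.

```python
# Sketch: per-token perplexity of a causal LM over a set of target-domain texts.
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def perplexity(model, tokenizer, texts, device="cpu"):
    """Average per-token perplexity of `model` over `texts`."""
    model.eval()
    total_nll, total_tokens = 0.0, 0
    with torch.no_grad():
        for text in texts:
            enc = tokenizer(text, return_tensors="pt", truncation=True).to(device)
            # The causal-LM loss is the mean cross-entropy over predicted tokens.
            out = model(**enc, labels=enc["input_ids"])
            n = enc["input_ids"].shape[1] - 1  # number of predicted positions
            total_nll += out.loss.item() * n
            total_tokens += n
    return math.exp(total_nll / total_tokens)

# Hypothetical usage: swap in checkpoints fine-tuned on datasets with
# different alignment to the target domain and compare the scores.
model_name = "gpt2"  # illustrative; any causal LM checkpoint works
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
target = ["theorem add_comm (a b : nat) : a + b = b + a"]
print(perplexity(model, tok, target))
```

Under the paper's finding, a checkpoint fine-tuned on data better aligned with the target domain should yield a lower score from this kind of evaluation.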
Old Research Journal