Here are some resources on alignment fine-tuning strategies for LLMs, especially instruction fine-tuning (IFT).
tag: Kimi k1.5
| Moonshot AI
paper link: here
code link: here
citation:
@misc{MoonshotAI,
author = {Kimi Team},
title = {Kimi k1.5: Scaling Reinforcement Learning with LLMs},
year = {2025},
}
tag: DeepSeek-R1
| DeepSeek
paper link: here
code link: here
citation:
@misc{deepseekai2025deepseekr1incentivizingreasoningcapability,
title={DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning},
author={DeepSeek-AI and Daya Guo and Dejian Yang and Haowei Zhang and Junxiao Song and Ruoyu Zhang and Runxin Xu and Qihao Zhu and Shirong Ma and Peiyi Wang and Xiao Bi and Xiaokang Zhang and Xingkai Yu and Yu Wu and Z. F. Wu and Zhibin Gou and Zhihong Shao and Zhuoshu Li and Ziyi Gao and Aixin Liu and Bing Xue and Bingxuan Wang and Bochao Wu and Bei Feng and Chengda Lu and Chenggang Zhao and Chengqi Deng and Chenyu Zhang and Chong Ruan and Damai Dai and Deli Chen and Dongjie Ji and Erhang Li and Fangyun Lin and Fucong Dai and Fuli Luo and Guangbo Hao and Guanting Chen and Guowei Li and H. Zhang and Han Bao and Hanwei Xu and Haocheng Wang and Honghui Ding and Huajian Xin and Huazuo Gao and Hui Qu and Hui Li and Jianzhong Guo and Jiashi Li and Jiawei Wang and Jingchang Chen and Jingyang Yuan and Junjie Qiu and Junlong Li and J. L. Cai and Jiaqi Ni and Jian Liang and Jin Chen and Kai Dong and Kai Hu and Kaige Gao and Kang Guan and Kexin Huang and Kuai Yu and Lean Wang and Lecong Zhang and Liang Zhao and Litong Wang and Liyue Zhang and Lei Xu and Leyi Xia and Mingchuan Zhang and Minghua Zhang and Minghui Tang and Meng Li and Miaojun Wang and Mingming Li and Ning Tian and Panpan Huang and Peng Zhang and Qiancheng Wang and Qinyu Chen and Qiushi Du and Ruiqi Ge and Ruisong Zhang and Ruizhe Pan and Runji Wang and R. J. Chen and R. L. Jin and Ruyi Chen and Shanghao Lu and Shangyan Zhou and Shanhuang Chen and Shengfeng Ye and Shiyu Wang and Shuiping Yu and Shunfeng Zhou and Shuting Pan and S. S. Li and Shuang Zhou and Shaoqing Wu and Shengfeng Ye and Tao Yun and Tian Pei and Tianyu Sun and T. Wang and Wangding Zeng and Wanjia Zhao and Wen Liu and Wenfeng Liang and Wenjun Gao and Wenqin Yu and Wentao Zhang and W. L. Xiao and Wei An and Xiaodong Liu and Xiaohan Wang and Xiaokang Chen and Xiaotao Nie and Xin Cheng and Xin Liu and Xin Xie and Xingchao Liu and Xinyu Yang and Xinyuan Li and Xuecheng Su and Xuheng Lin and X. Q. Li and Xiangyue Jin and Xiaojin Shen and Xiaosha Chen and Xiaowen Sun and Xiaoxiang Wang and Xinnan Song and Xinyi Zhou and Xianzu Wang and Xinxia Shan and Y. K. Li and Y. Q. Wang and Y. X. Wei and Yang Zhang and Yanhong Xu and Yao Li and Yao Zhao and Yaofeng Sun and Yaohui Wang and Yi Yu and Yichao Zhang and Yifan Shi and Yiliang Xiong and Ying He and Yishi Piao and Yisong Wang and Yixuan Tan and Yiyang Ma and Yiyuan Liu and Yongqiang Guo and Yuan Ou and Yuduan Wang and Yue Gong and Yuheng Zou and Yujia He and Yunfan Xiong and Yuxiang Luo and Yuxiang You and Yuxuan Liu and Yuyang Zhou and Y. X. Zhu and Yanhong Xu and Yanping Huang and Yaohui Li and Yi Zheng and Yuchen Zhu and Yunxian Ma and Ying Tang and Yukun Zha and Yuting Yan and Z. Z. Ren and Zehui Ren and Zhangli Sha and Zhe Fu and Zhean Xu and Zhenda Xie and Zhengyan Zhang and Zhewen Hao and Zhicheng Ma and Zhigang Yan and Zhiyu Wu and Zihui Gu and Zijia Zhu and Zijun Liu and Zilin Li and Ziwei Xie and Ziyang Song and Zizheng Pan and Zhen Huang and Zhipeng Xu and Zhongyu Zhang and Zhen Zhang},
year={2025},
eprint={2501.12948},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.12948},
}
tag: REINFORCE++
| OpenRLHF
paper link: here
code link: here
citation:
@misc{hu2025reinforcesimpleefficientapproach,
title={REINFORCE++: A Simple and Efficient Approach for Aligning Large Language Models},
author={Jian Hu},
year={2025},
eprint={2501.03262},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.03262},
}
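REINFORCE++ drops the critic entirely: per the paper, a token-level KL penalty is folded into the reward and advantages are normalized across the global batch before a PPO-style clipped update. A minimal sketch of that normalization step (tensor shapes are an assumption):

```python
import torch

def reinforce_pp_advantages(returns: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    # No learned value baseline: returns (with the KL penalty already folded
    # into the reward) are normalized across the whole batch, then fed to a
    # PPO-style clipped policy-gradient update.
    return (returns - returns.mean()) / (returns.std() + eps)
```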
tag: TULU 3
| RLVR
| Allen AI
paper link: here
blog link: here
code link: here
modelhub link: here
citation:
@misc{lambert2024tulu3pushingfrontiers,
title={T\"ULU 3: Pushing Frontiers in Open Language Model Post-Training},
author={Nathan Lambert and Jacob Morrison and Valentina Pyatkin and Shengyi Huang and Hamish Ivison and Faeze Brahman and Lester James V. Miranda and Alisa Liu and Nouha Dziri and Shane Lyu and Yuling Gu and Saumya Malik and Victoria Graf and Jena D. Hwang and Jiangjiang Yang and Ronan Le Bras and Oyvind Tafjord and Chris Wilhelm and Luca Soldaini and Noah A. Smith and Yizhong Wang and Pradeep Dasigi and Hannaneh Hajishirzi},
year={2024},
eprint={2411.15124},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2411.15124},
}
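TÜLU 3 introduces RLVR (Reinforcement Learning with Verifiable Rewards), which replaces a learned reward model with a deterministic verifier on tasks like math and constrained instruction following. A minimal sketch; `verify` is an assumed user-supplied checker and the bonus value is illustrative:

```python
def rlvr_reward(prompt: str, completion: str, verify, bonus: float = 10.0) -> float:
    # Reward is granted only when the completion verifiably satisfies the
    # task (e.g. exact-match on a math answer); no reward model is queried.
    return bonus if verify(prompt, completion) else 0.0
```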
tag: DQO
| ByteDance
paper link: here
citation:
@article{liu2024enhancing,
title={Enhancing Multi-Step Reasoning Abilities of Language Models through Direct Q-Function Optimization},
author={Liu, Guanlin and Ji, Kaixuan and Zheng, Renjie and Wu, Zheng and Dun, Chen and Gu, Quanquan and Yan, Lin},
journal={arXiv preprint arXiv:2410.09302},
year={2024}
}
tag: HybridFlow
| EuroSys25
| ByteDance
paper link: here
code link: here
citation:
@article{sheng2024hybridflow,
title = {HybridFlow: A Flexible and Efficient RLHF Framework},
author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
year = {2024},
journal = {arXiv preprint arXiv:2409.19256}
}
tag: ReaLHF
| RLHF
| Tsinghua University
paper link: here
code link: here
citation:
@misc{mei2024realhfoptimizedrlhftraining,
title={ReaLHF: Optimized RLHF Training for Large Language Models through Parameter Reallocation},
author={Zhiyu Mei and Wei Fu and Kaiwei Li and Guangju Wang and Huanchen Zhang and Yi Wu},
year={2024},
eprint={2406.14088},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2406.14088},
}
tag: CPO
| NIPS24
| Sea AI Lab
paper link: here
code link: here
citation:
@misc{zhang2024chainpreferenceoptimizationimproving,
title={Chain of Preference Optimization: Improving Chain-of-Thought Reasoning in LLMs},
author={Xuan Zhang and Chao Du and Tianyu Pang and Qian Liu and Wei Gao and Min Lin},
year={2024},
eprint={2406.09136},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2406.09136},
}
tag: RLAIF-V
| Tsinghua University
paper link: here
code link: here
citation:
@misc{yu2024rlaifvaligningmllmsopensource,
title={RLAIF-V: Aligning MLLMs through Open-Source AI Feedback for Super GPT-4V Trustworthiness},
author={Tianyu Yu and Haoye Zhang and Yuan Yao and Yunkai Dang and Da Chen and Xiaoman Lu and Ganqu Cui and Taiwen He and Zhiyuan Liu and Tat-Seng Chua and Maosong Sun},
year={2024},
eprint={2405.17220},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.17220},
}
tag: RLOO
| ACL24
| Cohere
paper link: here
citation:
@misc{ahmadian2024basicsrevisitingreinforcestyle,
title={Back to Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in LLMs},
author={Arash Ahmadian and Chris Cremer and Matthias Gallé and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet Üstün and Sara Hooker},
year={2024},
eprint={2402.14740},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2402.14740},
}
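RLOO replaces PPO's learned value baseline with a leave-one-out Monte Carlo baseline over k on-policy samples per prompt. A minimal sketch of the advantage computation from the paper:

```python
import torch

def rloo_advantages(rewards: torch.Tensor) -> torch.Tensor:
    # rewards: (num_prompts, k) scalar rewards for k sampled completions.
    # Each sample's baseline is the mean reward of the other k-1 samples.
    k = rewards.shape[1]
    baseline = (rewards.sum(dim=1, keepdim=True) - rewards) / (k - 1)
    return rewards - baseline
```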
tag: OAIF
| DAP
| Google DeepMind
paper link: here
citation:
@misc{guo2024directlanguagemodelalignment,
title={Direct Language Model Alignment from Online AI Feedback},
author={Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Rame and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel},
year={2024},
eprint={2402.04792},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2402.04792},
}
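OAIF keeps a direct alignment (DAP) loss such as DPO but makes it online: both responses are sampled from the current policy and an LLM annotator provides the preference on the fly. A minimal sketch of one step; all three callables are assumptions standing in for the paper's components:

```python
def oaif_step(prompt, sample, annotate, dap_update):
    # Sample two fresh responses from the current policy, let the LLM
    # annotator pick the preferred one, then apply the DAP (e.g. DPO) update
    # on this on-policy pair instead of a fixed offline preference dataset.
    y1, y2 = sample(prompt), sample(prompt)
    chosen, rejected = (y1, y2) if annotate(prompt, y1, y2) == 0 else (y2, y1)
    dap_update(prompt, chosen, rejected)
```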
tag: DeepSeekMath
| GRPO
| DeepSeek
| Tsinghua University
| Peking University
paper link: here
code link: here
citation:
@misc{shao2024deepseekmath,
title={DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models},
author={Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
year={2024},
eprint={2402.03300},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
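GRPO removes the value network from PPO: for each prompt it samples a group of G completions and uses the group's reward statistics as the baseline. A minimal sketch of the group-relative advantage; these advantages then enter a PPO-style clipped objective with a KL term to the reference policy:

```python
import torch

def grpo_advantages(rewards: torch.Tensor, eps: float = 1e-4) -> torch.Tensor:
    # rewards: (num_prompts, G) -- one scalar reward per sampled completion.
    # The baseline is the group mean; normalizing by the group std makes the
    # advantage "group-relative" as in DeepSeekMath.
    mean = rewards.mean(dim=1, keepdim=True)
    std = rewards.std(dim=1, keepdim=True)
    return (rewards - mean) / (std + eps)
```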
tag: SPIN
| Self-Play
| ICML24
| UCLA
paper link: here
code link: here
citation:
@misc{chen2024selfplay,
title={Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models},
author={Zixiang Chen and Yihe Deng and Huizhuo Yuan and Kaixuan Ji and Quanquan Gu},
year={2024},
eprint={2401.01335},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
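SPIN is a self-play variant of the DPO loss: the human demonstration plays the winner, a completion sampled from the previous iterate plays the loser, and the previous iterate serves as the reference model. A minimal sketch, assuming summed per-sequence log-probs are precomputed:

```python
import torch.nn.functional as F

def spin_loss(policy_real_logps, policy_synth_logps,
              prev_real_logps, prev_synth_logps, lam: float = 0.1):
    # Logistic loss on the log-ratio margin between the human demonstration
    # (real) and the self-generated completion (synth), with the previous
    # iterate acting as the reference policy.
    margin = (policy_real_logps - prev_real_logps) \
             - (policy_synth_logps - prev_synth_logps)
    return -F.logsigmoid(lam * margin).mean()
```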
tag: Zephyr
| dDPO
| COLM24
| HuggingFace
paper link: here
code link: here
modelhub link: here
citation:
@misc{tunstall2023zephyr,
title={Zephyr: Direct Distillation of LM Alignment},
author={Lewis Tunstall and Edward Beeching and Nathan Lambert and Nazneen Rajani and Kashif Rasul and Younes Belkada and Shengyi Huang and Leandro von Werra and Clémentine Fourrier and Nathan Habib and Nathan Sarrazin and Omar Sanseviero and Alexander M. Rush and Thomas Wolf},
year={2023},
eprint={2310.16944},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
tag: FD-Align
| NIPS23
| USTB
paper link: here
code link: here
citation:
@article{song2023fd,
title={FD-Align: Feature Discrimination Alignment for Fine-tuning Pre-Trained Models in Few-Shot Learning},
author={Song, Kun and Ma, Huimin and Zou, Bochao and Zhang, Huishuai and Huang, Weiran},
journal={arXiv preprint arXiv:2310.15105},
year={2023}
}
tag: GPO
| ICLR24
| UCLA
paper link: here
code link: here
homepage link: here
citation:
@misc{zhao2023group,
title={Group Preference Optimization: Few-Shot Alignment of Large Language Models},
author={Siyan Zhao and John Dang and Aditya Grover},
year={2023},
eprint={2310.11523},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
tag: P3O
| UC Berkeley
paper link: here
citation:
@misc{wu2023pairwise,
title={Pairwise Proximal Policy Optimization: Harnessing Relative Feedback for LLM Alignment},
author={Tianhao Wu and Banghua Zhu and Ruoyu Zhang and Zhaojin Wen and Kannan Ramchandran and Jiantao Jiao},
year={2023},
eprint={2310.00212},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
tag: OpenChat
| C-RLFT
| ICLR24
| Tsinghua University
paper link: here
code link: here
modelhub link: here
citation:
@article{wang2023openchat,
title={Openchat: Advancing open-source language models with mixed-quality data},
author={Wang, Guan and Cheng, Sijie and Zhan, Xianyuan and Li, Xiangang and Song, Sen and Liu, Yang},
journal={arXiv preprint arXiv:2309.11235},
year={2023}
}
tag: ReST
| RLHF
| Google DeepMind
paper link: here
citation:
@misc{gulcehre2023reinforcedselftrainingrestlanguage,
title={Reinforced Self-Training (ReST) for Language Modeling},
author={Caglar Gulcehre and Tom Le Paine and Srivatsan Srinivasan and Ksenia Konyushkova and Lotte Weerts and Abhishek Sharma and Aditya Siddhant and Alex Ahern and Miaosen Wang and Chenjie Gu and Wolfgang Macherey and Arnaud Doucet and Orhan Firat and Nando de Freitas},
year={2023},
eprint={2308.08998},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2308.08998},
}
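ReST alternates a Grow step (sample a dataset from the current policy) with Improve steps (fine-tune on reward-filtered subsets at increasing thresholds). A minimal sketch of one cycle; the callables and the threshold schedule are assumptions:

```python
def rest_cycle(prompts, sample, reward_fn, finetune,
               thresholds=(0.0, 0.5, 0.7), n_samples: int = 4):
    # Grow: sample once from the current policy and score every completion.
    grown = []
    for x in prompts:
        for _ in range(n_samples):
            y = sample(x)
            grown.append((x, y, reward_fn(x, y)))
    # Improve: fine-tune on progressively higher-reward subsets.
    for tau in thresholds:
        finetune([(x, y) for x, y, r in grown if r >= tau])
```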
tag: APA
| UC Berkeley
paper link: here
citation:
@misc{zhu2023finetuning,
title={Fine-Tuning Language Models with Advantage-Induced Policy Alignment},
author={Banghua Zhu and Hiteshi Sharma and Felipe Vieira Frujeri and Shi Dong and Chenguang Zhu and Michael I. Jordan and Jiantao Jiao},
year={2023},
eprint={2306.02231},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
tag: DPO
| NIPS23
| Stanford University
paper link: here
citation:
@article{rafailov2023direct,
title={Direct preference optimization: Your language model is secretly a reward model},
author={Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Ermon, Stefano and Manning, Christopher D and Finn, Chelsea},
journal={arXiv preprint arXiv:2305.18290},
year={2023}
}
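DPO collapses the RLHF pipeline into one supervised loss on preference pairs. A minimal PyTorch sketch of the objective (Eq. 7 in the paper), assuming summed per-sequence log-probs under the policy and the frozen reference are precomputed:

```python
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta: float = 0.1):
    # beta scales the implicit KL penalty to the reference model.
    chosen_logratio = policy_chosen_logps - ref_chosen_logps
    rejected_logratio = policy_rejected_logps - ref_rejected_logps
    # -log sigmoid(beta * margin), averaged over the batch of pairs.
    return -F.logsigmoid(beta * (chosen_logratio - rejected_logratio)).mean()
```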
tag: LIMA
| NIPS23
| Meta
| CMU
paper link: here
citation:
@article{zhou2023lima,
title={Lima: Less is more for alignment},
author={Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srini and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
journal={arXiv preprint arXiv:2305.11206},
year={2023}
}
tag: Self-Align
| NIPS23
| IBM Research
| CMU
| MIT
paper link: here
code link: here
citation:
@article{sun2023principle,
title={Principle-driven self-alignment of language models from scratch with minimal human supervision},
author={Sun, Zhiqing and Shen, Yikang and Zhou, Qinhong and Zhang, Hongxin and Chen, Zhenfang and Cox, David and Yang, Yiming and Gan, Chuang},
journal={arXiv preprint arXiv:2305.03047},
year={2023}
}
tag: WizardLM
| ICLR24
| Microsoft
| Peking University
paper link: here
code link: here
modelhub link: here
citation:
@misc{xu2023wizardlm,
title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},
author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},
year={2023},
eprint={2304.12244},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
tag: RAFT
| TMLR23
| HKU
paper link: here
code link: here
citation:
@misc{dong2023raftrewardrankedfinetuning,
title={RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment},
author={Hanze Dong and Wei Xiong and Deepanshu Goyal and Yihan Zhang and Winnie Chow and Rui Pan and Shizhe Diao and Jipeng Zhang and Kashun Shum and Tong Zhang},
year={2023},
eprint={2304.06767},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2304.06767},
}
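RAFT sidesteps RL entirely: sample k responses per prompt, keep the highest-reward one, and run ordinary SFT on the survivors. A minimal sketch of the selection step; `sample` and `reward_fn` are assumed user-supplied callables:

```python
def raft_select(prompts, sample, reward_fn, k: int = 8):
    # Best-of-k filtering: the returned (prompt, response) pairs are used
    # for a standard supervised fine-tuning pass, then the loop repeats.
    data = []
    for x in prompts:
        candidates = [sample(x) for _ in range(k)]
        data.append((x, max(candidates, key=lambda y: reward_fn(x, y))))
    return data
```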
tag: RRHF
| NIPS23
| DAMO Academy
| Alibaba Group
| Tsinghua University
paper link: here
code link: here
citation:
@article{yuan2023rrhf,
title={Rrhf: Rank responses to align language models with human feedback without tears},
author={Yuan, Zheng and Yuan, Hongyi and Tan, Chuanqi and Wang, Wei and Huang, Songfang and Huang, Fei},
journal={arXiv preprint arXiv:2304.05302},
year={2023}
}
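RRHF aligns the ranking of length-normalized sequence log-probs with the reward ranking of k candidate responses, plus a cross-entropy term on the best one (omitted here). A minimal sketch of the ranking loss:

```python
import torch

def rrhf_rank_loss(seq_logps, seq_lengths, rewards):
    # p_i: length-normalized log-probability of candidate i under the policy.
    p = seq_logps / seq_lengths
    loss = seq_logps.new_zeros(())
    for i in range(len(rewards)):
        for j in range(len(rewards)):
            if rewards[i] < rewards[j]:  # i should not outscore j
                loss = loss + torch.relu(p[i] - p[j])
    return loss
```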
tag: OpenAGI
| RLTF
| NIPS23
| Rutgers University
paper link: here
code link: here
citation:
@article{ge2023openagi,
title={Openagi: When llm meets domain experts},
author={Ge, Yingqiang and Hua, Wenyue and Ji, Jianchao and Tan, Juntao and Xu, Shuyuan and Zhang, Yongfeng},
journal={arXiv preprint arXiv:2304.04370},
year={2023}
}
tag: K-wise Comparison
| ICML23
| UC Berkeley
paper link: here
citation:
@misc{zhu2023principled,
title={Principled Reinforcement Learning with Human Feedback from Pairwise or $K$-wise Comparisons},
author={Banghua Zhu and Jiantao Jiao and Michael I. Jordan},
year={2023},
eprint={2301.11270},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
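The paper analyzes reward learning from K-wise comparisons via maximum likelihood under the Plackett-Luce model, the K-wise generalization of Bradley-Terry. A minimal sketch of the per-ranking negative log-likelihood:

```python
import torch

def plackett_luce_nll(ranked_scores: torch.Tensor) -> torch.Tensor:
    # ranked_scores: reward-model scores ordered best-first. Each position k
    # contributes -log softmax of the k-th item among the remaining items.
    nll = ranked_scores.new_zeros(())
    for k in range(len(ranked_scores) - 1):
        nll = nll - (ranked_scores[k] - torch.logsumexp(ranked_scores[k:], dim=0))
    return nll
```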
tag: Self-instruct
| IFT
| Instruction Tuning
| ACL23
| University of Washington
paper link: here
code link: here
citation:
@article{wang2022self,
title={Self-instruct: Aligning language models with self-generated instructions},
author={Wang, Yizhong and Kordi, Yeganeh and Mishra, Swaroop and Liu, Alisa and Smith, Noah A and Khashabi, Daniel and Hajishirzi, Hannaneh},
journal={arXiv preprint arXiv:2212.10560},
year={2022}
}
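A key step in the Self-Instruct pipeline is the diversity filter: a newly generated instruction joins the pool only if its ROUGE-L similarity to every existing instruction stays below 0.7. A minimal sketch using the `rouge-score` package:

```python
from rouge_score import rouge_scorer  # pip install rouge-score

def keep_instruction(candidate: str, pool: list[str], threshold: float = 0.7) -> bool:
    # Reject candidates that are too similar (ROUGE-L F-measure >= threshold)
    # to any instruction already in the task pool.
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
    return all(
        scorer.score(existing, candidate)["rougeL"].fmeasure < threshold
        for existing in pool
    )
```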
tag: Self-Prompting
| IFT
| Instruction Tuning
| NAACL24
| SJTU
paper link: here
code link: here
citation:
@misc{li2023selfprompting,
title={Self-Prompting Large Language Models for Zero-Shot Open-Domain QA},
author={Junlong Li and Zhuosheng Zhang and Hai Zhao},
year={2023},
eprint={2212.08635},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
tag: RLAIF
| Constitutional AI
| Anthropic
paper link: here
code link: here
citation:
@article{bai2022constitutional,
title={Constitutional ai: Harmlessness from ai feedback},
author={Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
journal={arXiv preprint arXiv:2212.08073},
year={2022}
}
tag: InstructGPT
| IFT
| Instruction Tuning
| NIPS22
| OpenAI
paper link: here
citation:
@article{ouyang2022training,
title={Training language models to follow instructions with human feedback},
author={Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
journal={Advances in Neural Information Processing Systems},
volume={35},
pages={27730--27744},
year={2022}
}
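Stage two of the InstructGPT recipe trains a reward model on pairwise human comparisons before the PPO stage. A minimal sketch of that pairwise loss, assuming scalar scores from the RM head:

```python
import torch.nn.functional as F

def reward_model_loss(chosen_scores, rejected_scores):
    # -log sigmoid(r(x, y_w) - r(x, y_l)), averaged over comparison pairs.
    return -F.logsigmoid(chosen_scores - rejected_scores).mean()
```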
tag: ILQL
| ICLR23
| UC Berkeley
paper link: here
code link: here
homepage link: here
citation:
@misc{snell2023offlinerlnaturallanguage,
title={Offline RL for Natural Language Generation with Implicit Language Q Learning},
author={Charlie Snell and Ilya Kostrikov and Yi Su and Mengjiao Yang and Sergey Levine},
year={2023},
eprint={2206.11871},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2206.11871},
}
tag: FLAN-T5
| FLAN-PaLM
| IFT
| Instruction Tuning
| JMLR24
| Google
paper link: here
code link: here
citation:
@misc{chung2022scaling,
title={Scaling Instruction-Finetuned Language Models},
author={Hyung Won Chung and Le Hou and Shayne Longpre and Barret Zoph and Yi Tay and William Fedus and Yunxuan Li and Xuezhi Wang and Mostafa Dehghani and Siddhartha Brahma and Albert Webson and Shixiang Shane Gu and Zhuyun Dai and Mirac Suzgun and Xinyun Chen and Aakanksha Chowdhery and Alex Castro-Ros and Marie Pellat and Kevin Robinson and Dasha Valter and Sharan Narang and Gaurav Mishra and Adams Yu and Vincent Zhao and Yanping Huang and Andrew Dai and Hongkun Yu and Slav Petrov and Ed H. Chi and Jeff Dean and Jacob Devlin and Adam Roberts and Denny Zhou and Quoc V. Le and Jason Wei},
year={2022},
eprint={2210.11416},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
tag: FLAN
| IFT
| Instruction Tuning
| ICLR22
| Google
paper link: here
code link: here
citation:
@article{wei2021finetuned,
title={Finetuned language models are zero-shot learners},
author={Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
journal={arXiv preprint arXiv:2109.01652},
year={2021}
}
tag: RLHF
| PPO
| OpenAI
paper link: here
blog link: here
code link: here
citation:
@article{ziegler2019fine,
title={Fine-tuning language models from human preferences},
author={Ziegler, Daniel M and Stiennon, Nisan and Wu, Jeffrey and Brown, Tom B and Radford, Alec and Amodei, Dario and Christiano, Paul and Irving, Geoffrey},
journal={arXiv preprint arXiv:1909.08593},
year={2019}
}
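The central trick in Ziegler et al. is shaping the reward with a KL penalty to the pretrained model before running PPO: R(x, y) = r(x, y) - beta * [log pi(y|x) - log rho(y|x)]. A minimal per-sequence sketch:

```python
def kl_shaped_reward(rm_reward, policy_logps, ref_logps, beta: float = 0.02):
    # The KL term keeps the fine-tuned policy close to the pretrained
    # reference rho; the paper also describes adapting beta to hit a
    # target KL. Inputs are per-sequence scalars or tensors.
    return rm_reward - beta * (policy_logps - ref_logps)
```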
tag: DPO
| PPO
| ICML24
| Tsinghua University
paper link: here
code link: here
citation:
@misc{xu2024dposuperiorppollm,
title={Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study},
author={Shusheng Xu and Wei Fu and Jiaxuan Gao and Wenjie Ye and Weilin Liu and Zhiyu Mei and Guangju Wang and Chao Yu and Yi Wu},
year={2024},
eprint={2404.10719},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.10719},
}
tag: RLHF
| Online Alignment
| Offline Alignment
| Google DeepMind
paper link: here
citation:
@misc{tang2024understandingperformancegaponline,
title={Understanding the performance gap between online and offline alignment algorithms},
author={Yunhao Tang and Daniel Zhaohan Guo and Zeyu Zheng and Daniele Calandriello and Yuan Cao and Eugene Tarassov and Rémi Munos and Bernardo Ávila Pires and Michal Valko and Yong Cheng and Will Dabney},
year={2024},
eprint={2405.08448},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2405.08448},
}