% Conference paper entry, cited as TrainCheckOSDI2025.
% - month uses the predefined macro (jul) so each style can format or
%   localize the month name itself.
% - Acronyms in booktitle and series are brace-protected so case-changing
%   styles leave them intact.
% NOTE(review): address looks like the conference venue rather than the
%   publisher's city; most styles print it next to the publisher. Confirm
%   which is intended before relying on the rendered output.
@inproceedings{TrainCheckOSDI2025,
  author    = {Jiang, Yuxuan and Zhou, Ziming and Xu, Boyu and Liu, Beijie and Xu, Runhui and Huang, Peng},
  title     = {Training with Confidence: Catching Silent Errors in Deep Learning Training with Automated Proactive Checks},
  booktitle = {Proceedings of the 19th {USENIX} Symposium on Operating Systems Design and Implementation},
  series    = {{OSDI} '25},
  month     = jul,
  year      = {2025},
  address   = {Boston, MA, USA},
  publisher = {USENIX Association},
}