44---
55< section class ="section-margin ">
66 < div class ="container ">
7+ < h2 id ="publications "> 2026</ h2 >
8+ < ul class ="publications ">
9+ < li >
10+ < a target ="_blank " href ="/ "> OpGuard: Bitwise Alignment for Precise and General Debugging of Production LLM Training</ a > < br >
11+ < span class ="authorlist "> < i > < a href ="https://ziming-zh.github.io " class ="nodec "> Ziming Zhou</ a > , </ i > < i > Yinjie Zhao, </ i > < i > Hang Zhu, </ i > < i > Wenxiao Wang, </ i > < i > Zhihao Bai, </ i > < i > Yun Zhang, </ i > < i > Shuguang Wang, </ i > < i > Haibin Lin, </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
12+ < a target ="_blank " href ="https://www.usenix.org/conference/osdi26 " class ="conf "> < b > OSDI 2026</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="/ "> citation</ a >
13+ </ li >
14+
15+ </ ul >
716 < h2 id ="publications "> 2025</ h2 >
817 < ul class ="publications ">
918 < li >
@@ -26,7 +35,7 @@ <h2 id="publications">2025</h2>
2635 </ li >
2736 < li >
2837 < a target ="_blank " href ="/paper/traincheck-osdi25-preprint.pdf "> Training with Confidence: Catching Silent Errors in Deep Learning Training with Automated Proactive Checks</ a > < br >
29- < span class ="authorlist "> < i > < a href ="https://essoz.github.io " class ="nodec "> Yuxuan Jiang</ a > , </ i > < i > Ziming Zhou, </ i > < i > Boyu Xu, </ i > < i > Beijie Liu, </ i > < i > Runhui Xu, </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
38+ < span class ="authorlist "> < i > < a href ="https://essoz.github.io " class ="nodec "> Yuxuan Jiang</ a > , </ i > < i > < a href =" https://ziming-zh.github.io " class =" nodec " > Ziming Zhou</ a > , </ i > < i > Boyu Xu, </ i > < i > Beijie Liu, </ i > < i > Runhui Xu, </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
3039 < a target ="_blank " href ="https://www.usenix.org/conference/osdi25 " class ="conf "> < b > OSDI 2025</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="/paper/traincheck-osdi25.bib "> citation</ a >
3140 < a target ="_blank " role ="button " class ="btn btn-outline-primary publinkitem " href ="/slides/traincheck_osdi25_slides.pdf "> slides</ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://github.com/OrderLab/TrainCheck "> software</ a > < a target ="_blank " role ="button " class ="btn btn-outline-primary publinkitem " href ="https://www.arxiv.org/abs/2506.14813 "> arXiv</ a > < br > < div class ="press "> < b > Coverage:</ b > < a target ="_blank " href ="https://cse.engin.umich.edu/stories/improving-ai-models-automated-tool-detects-silent-errors-in-deep-learning-training "> CSE News</ a > , < a target ="_blank " href ="https://news.engin.umich.edu/2025/07/improving-ai-models-automated-tool-detects-silent-errors-in-deep-learning-training "> Michigan Engineering News</ a > , < a target ="_blank " href ="https://techxplore.com/news/2025-07-ai-automated-tool-silent-errors.html "> Tech Xplore</ a > </ div >
3241 </ li >
@@ -208,7 +217,7 @@ <h2 id="publications">2017</h2>
208217 < h2 id ="publications "> 2016</ h2 >
209218 < ul class ="publications ">
210219 < li >
211- < a target ="_blank " href ="http://opera.ucsd.edu// paper/osdi16-pcheck.pdf "> Early Detection of Configuration Errors to Reduce Failure Damage</ a > < b style ="color:green "> [Best Paper Award]</ b > < br >
220+ < a target ="_blank " href ="http://opera.ucsd.edu/paper/osdi16-pcheck.pdf "> Early Detection of Configuration Errors to Reduce Failure Damage</ a > < b style ="color:green "> [Best Paper Award]</ b > < br >
212221 < span class ="authorlist "> < i > < a href ="http://cseweb.ucsd.edu/~tixu " class ="nodec "> Tianyin Xu</ a > , </ i > < i > < a href ="http://cseweb.ucsd.edu/~x7jin " class ="nodec "> Xinxin Jin</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > , </ i > < i > < a href ="http://cseweb.ucsd.edu/~yyzhou " class ="nodec "> Yuanyuan Zhou</ a > , </ i > < i > < a href ="http://people.cs.uchicago.edu/~shanlu " class ="nodec "> Shan Lu</ a > , </ i > < i > Long Jin, </ i > < i > Shankar Pasupathy< br > </ i > </ span >
213222 < a target ="_blank " href ="https://www.usenix.org/conference/osdi16 " class ="conf "> < b > OSDI 2016</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="/paper/pcheck.bib "> citation</ a >
214223 </ li >
@@ -278,7 +287,7 @@ <h2 id="publications">2013</h2>
278287 < h2 id ="publications "> 2012</ h2 >
279288 < ul class ="publications ">
280289 < li >
281- < a target ="_blank " href ="http://opera.ucsd.edu// paper/osdi12-errlog.pdf "> Be Conservative: Enhancing Failure Diagnosis with Proactive Logging</ a > < br >
290+ < a target ="_blank " href ="http://opera.ucsd.edu/paper/osdi12-errlog.pdf "> Be Conservative: Enhancing Failure Diagnosis with Proactive Logging</ a > < br >
282291 < span class ="authorlist "> < i > < a href ="http://www.eecg.toronto.edu/~yuan " class ="nodec "> Ding Yuan</ a > , </ i > < i > Soyeon Park, </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > , </ i > < i > Yang Liu, </ i > < i > Michael M. Lee, </ i > < i > Xiaoming Tang, </ i > < i > < a href ="http://cseweb.ucsd.edu/~yyzhou " class ="nodec "> Yuanyuan Zhou</ a > , </ i > < i > < a href ="http://cseweb.ucsd.edu/~savage " class ="nodec "> Stefan Savage</ a > < br > </ i > </ span >
283292 < a target ="_blank " href ="http://www.usenix.org/events/osdi12 " class ="conf "> < b > OSDI 2012</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="/paper/errlog.bib "> citation</ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="http://opera.ucsd.edu/errlog.htm "> dataset</ a >
284293 </ li >
0 commit comments