SymbioticLab.github.io/source/_data/SymbioticLab.bib at 9ee8fd2f3056d8ebff967abee345117344409405 · SymbioticLab/SymbioticLab.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% Encoding: UTF-8
% HUG: multi-resource fairness for correlated and elastic demands (USENIX NSDI, March 2016).
@inproceedings{hug:nsdi16,
  author           = {Mosharaf Chowdhury and Zhenhua Liu and Ali Ghodsi and Ion Stoica},
  title            = {{HUG}: Multi-Resource Fairness for Correlated and Elastic Demands},
  booktitle        = {USENIX NSDI},
  year             = {2016},
  month            = {March},
  publist_confkey  = {NSDI'16},
  publist_link     = {paper || hug-nsdi16.pdf},
  publist_topic    = {Datacenter Networking},
  abstract         = {In this paper, we study how to optimally provide isolation guarantees in multi-resource environments, such as public clouds, where a tenant's demands on different resources (links) are correlated. Unlike prior work such as Dominant Resource Fairness (DRF) that assumes static and fixed demands, we consider elastic demands. Our approach generalizes canonical max-min fairness to the multi-resource setting with correlated demands, and extends DRF to elastic demands. We consider two natural optimization objectives: isolation guarantee from a tenant's viewpoint and system utilization (work conservation) from an operator's perspective. We prove that in non-cooperative environments like public cloud networks, there is a strong tradeoff between optimal isolation guarantee and work conservation when demands are elastic. Even worse, work conservation can even decrease network utilization instead of improving it when demands are inelastic. We identify the root cause behind the tradeoff and present a provably optimal allocation algorithm, High Utilization with Guarantees (HUG), to achieve maximum attainable network utilization without sacrificing the optimal isolation guarantee, strategy-proofness, and other useful properties of DRF. In cooperative environments like private datacenter networks, HUG achieves both the optimal isolation guarantee and work conservation. Analyses, simulations, and experiments show that HUG provides better isolation guarantees, higher system utilization, and better tenant-level performance than its counterparts.}
}

% Carbyne: altruistic scheduling in multi-resource clusters (USENIX OSDI, October 2016).
@inproceedings{carbyne:osdi16,
  author           = {Robert Grandl and Mosharaf Chowdhury and Aditya Akella and Ganesh Ananthanarayanan},
  title            = {Altruistic Scheduling in Multi-Resource Clusters},
  booktitle        = {USENIX OSDI},
  year             = {2016},
  month            = {October},
  publist_confkey  = {OSDI'16},
  publist_link     = {paper || carbyne-osdi16.pdf},
  publist_topic    = {Big Data Systems},
  abstract         = {Given the well-known tradeoffs between fairness, performance, and efficiency, modern cluster schedulers often prefer instantaneous fairness as their primary objective to ensure performance isolation between users and groups. However, instantaneous, short-term convergence to fairness often does not result in noticeable long-term benefits. Instead, we propose an altruistic, long-term approach, Carbyne, where jobs yield fractions of their allocated resources without impacting their own completion times. We show that leftover resources collected via altruisms of many jobs can then be rescheduled to further secondary goals such as application-level performance and cluster efficiency without impacting performance isolation. Deployments and large-scale simulations show that Carbyne closely approximates the state-of-the-art solutions (e.g., DRF) in terms of performance isolation, while providing 1.26X better efficiency and 1.59X lower average job completion time.}
}

% Infiniswap magazine article (USENIX ;login:, vol. 42 no. 4, December 2017).
% Fix: restored the hyphen in "low-latency" in the abstract (was the fused word "lowlatency").
@article{infiniswap:login17,
  author           = {Juncheng Gu and Youngmoon Lee and Yiwen Zhang and Mosharaf Chowdhury and Kang G. Shin},
  title            = {Decentralized Memory Disaggregation Over Low-Latency Networks},
  journal          = {USENIX ;login:},
  year             = {2017},
  month            = {December},
  volume           = {42},
  number           = {4},
  pages            = {42--48},
  publist_confkey  = {USENIX ;login: Winter 2017},
  publist_link     = {paper || infiniswap-login17.pdf},
  publist_topic    = {Disaggregation},
  abstract         = {Memory disaggregation can expose remote memory across a cluster to local applications. However, existing proposals call for new architectures and/or new programming models, making them infeasible. We have developed a practical memory disaggregation solution, Infiniswap, which is a remote memory paging system for clusters with low-latency, kernel-bypass networks such as RDMA. Infiniswap opportunistically harvests and transparently exposes unused memory across the cluster to unmodified applications by dividing the swap space of each machine into many chunks and distributing them to unused memory of many remote machines. For scalability, it leverages the power of many choices to perform decentralized memory chunk placements and evictions. Applications using Infiniswap receive large performance boosts when their working sets are larger than their physical memory allocations.}
}

% Infiniswap conference paper (USENIX NSDI, March 2017).
% The repeated publist_link fields are intentional (one per resource); their order is preserved.
@inproceedings{infiniswap:nsdi17,
  author           = {Juncheng Gu and Youngmoon Lee and Yiwen Zhang and Mosharaf Chowdhury and Kang G. Shin},
  title            = {Efficient Memory Disaggregation with {Infiniswap}},
  booktitle        = {USENIX NSDI},
  year             = {2017},
  month            = {March},
  publist_confkey  = {NSDI'17},
  publist_link     = {paper || infiniswap-nsdi17.pdf},
  publist_link     = {slides || infiniswap-nsdi17-slides.pdf},
  publist_link     = {code || https://github.com/SymbioticLab/Infiniswap},
  publist_link     = {website || https://infiniswap.github.io/},
  publist_link     = {media || https://blog.acolyer.org/2017/05/05/efficient-memory-disaggregation-with-infiniswap/},
  publist_topic    = {Disaggregation},
  publist_abstract = {
  Memory-intensive applications suffer large performance loss when their working sets do not fully fit in memory. Yet, they cannot leverage otherwise unused remote memory when paging out to disks even in the presence of large imbalance in memory utilizations across a cluster. Existing proposals for memory disaggregation call for new architectures, new hardware designs, and/or new programming models, making them infeasible.
  This paper describes the design and implementation of Infiniswap, a remote memory paging system designed specifically for an RDMA network. Infiniswap opportunistically harvests and transparently exposes unused memory to unmodified applications by dividing the swap space of each machine into many slabs and distributing them across many machines' remote memory. Because one-sided RDMA operations bypass remote CPUs, Infiniswap leverages the power of many choices to perform decentralized slab placements and evictions.
  We have implemented and deployed Infiniswap on an RDMA cluster without any modifications to user applications or the OS and evaluated its effectiveness using multiple workloads running on unmodified VoltDB, Memcached, PowerGraph, GraphX, and Apache Spark. Using Infiniswap, throughputs of these applications improve between 4X (0.94X) to 15.4X (7.8X) over disk (Mellanox nbdX), and median and tail latencies between 5.4X (2X) and 61X (2.3X). Infiniswap achieves these with negligible remote CPU usage, whereas nbdX becomes CPU-bound. Infiniswap increases the overall memory utilization of a cluster and works well at scale.}
}

% CellScope technical report (CoRR abs/1605.04652, May 2016).
@techreport{cellscope:tr16,
  author           = {Anand Padmanabha Iyer and Ion Stoica and Mosharaf Chowdhury and Li Erran Li},
  title            = {Fast and Accurate Performance Analysis of {LTE} Radio Access Networks},
  institution      = {CoRR},
  year             = {2016},
  month            = {May},
  number           = {abs/1605.04652},
  publist_confkey  = {arXiv:1605.04652},
  publist_link     = {paper || https://arxiv.org/abs/1605.04652},
  publist_topic    = {Wide-Area Computing},
  abstract         = {An increasing amount of analytics is performed on data that is procured in a real-time fashion to make real-time decisions. Such tasks include simple reporting on streams to sophisticated model building. However, the practicality of such analyses are impeded in several domains because they are faced with a fundamental trade-off between data collection latency and analysis accuracy.

In this paper, we study this trade-off in the context of a specific domain, Cellular Radio Access Networks (RAN). Our choice of this domain is influenced by its commonalities with several other domains that produce real-time data, our access to a large live dataset, and their real-time nature and dimensionality which makes it a natural fit for a popular analysis technique, machine learning (ML). We find that the latency accuracy trade-off can be resolved using two broad, general techniques: intelligent data grouping and task formulations that leverage domain characteristics. Based on this, we present CellScope, a system that addresses this challenge by applying a domain specific formulation and application of Multi-task Learning (MTL) to RAN performance analysis. It achieves this goal using three techniques: feature engineering to transform raw data into effective features, a PCA inspired similarity metric to group data from geographically nearby base stations sharing performance commonalities, and a hybrid online-offline model for efficient model updates. Our evaluation of CellScope shows that its accuracy improvements over direct application of ML range from 2.5x to 4.4x while reducing the model update overhead by up to 4.8x. We have also used CellScope to analyze a live LTE consisting of over 2 million subscribers for a period of over 10 months, where it uncovered several problems and insights, some of them previously unknown.}
}

% Position paper on deep learning frameworks (ACM HotOS, May 2017).
@inproceedings{deepstack:hotos17,
  author           = {Linh Nguyen and Peifeng Yu and Mosharaf Chowdhury},
  title            = {No! Not Another Deep Learning Framework},
  booktitle        = {ACM HotOS},
  year             = {2017},
  month            = {May},
  publist_confkey  = {HotOS'17},
  publist_link     = {paper || deepstack-hotos17.pdf},
  publist_topic    = {Systems + AI},
  publist_abstract = {
    In recent years, deep learning has pervaded many areas of computing due to the confluence of an explosive growth of large-scale computing capabilities, availability of datasets, and advances in learning techniques.
    While this rapid growth has resulted in diverse deep learning frameworks, it has also led to inefficiencies for both the users and developers of these frameworks.
    Specifically, adopting useful techniques across frameworks -- both to perform learning tasks and to optimize performance -- involves significant repetitions and reinventions.

    In this paper, we observe that despite their diverse origins, many of these frameworks share architectural similarities.
    We argue that by introducing a common representation of learning tasks and a hardware abstraction model to capture compute heterogeneity, we might be able to relieve machine learning researchers from dealing with low-level systems issues and systems researchers from being tied to any specific framework.
    We expect this decoupling to accelerate progress in both domains.
  }
}

% EC-Cache: cluster caching with online erasure coding (USENIX OSDI, October 2016).
@inproceedings{eccache:osdi16,
  author           = {K. V. Rashmi and Mosharaf Chowdhury and Jack Kosaian and Ion Stoica and Kannan Ramchandran},
  title            = {{EC-Cache}: Load-Balanced, Low-Latency Cluster Caching with Online Erasure Coding},
  booktitle        = {USENIX OSDI},
  year             = {2016},
  month            = {October},
  publist_confkey  = {OSDI'16},
  publist_link     = {paper || eccache-osdi16.pdf},
  publist_topic    = {Big Data Systems},
  abstract         = {Data-intensive clusters and object stores are increasingly relying on in-memory object caching to meet the I/O performance demands. These systems routinely face the challenges of popularity skew, background load imbalance, and server failures, which result in severe load imbalance across storage servers and degraded I/O performance. Selective replication is a commonly used technique to tackle these challenges, where the number of cached replicas of an object is proportional to its popularity. In this paper, we explore an alternative approach using erasure coding.

EC-Cache is a load-balanced, low latency cluster cache that uses online erasure coding to overcome the limitations of selective replication. EC-Cache employs erasure coding by: (i) splitting and erasure coding individual objects during writes, and (ii) late binding, wherein obtaining any k out of (k + r) splits of an object are sufficient, during reads. As compared to selective replication, EC-Cache improves load balancing by more than 3X and reduces the median and tail read latencies by more than 2X, while using the same amount of memory. EC-Cache does so using 10% additional bandwidth and a small increase in the amount of stored metadata. The benefits offered by EC-Cache are further amplified in the presence of background network load imbalance and server failures.}
}

% CODA: automatic coflow identification and scheduling (ACM SIGCOMM, August 2016).
@inproceedings{coda:sigcomm16,
  author           = {Hong Zhang and Li Chen and Bairen Yi and Kai Chen and Mosharaf Chowdhury and Yanhui Geng},
  title            = {{CODA}: Toward Automatically Identifying and Scheduling {CO}flows in the {DA}rk},
  booktitle        = {ACM SIGCOMM},
  year             = {2016},
  month            = {August},
  publist_confkey  = {SIGCOMM'16},
  publist_link     = {paper || coda-sigcomm16.pdf},
  publist_topic    = {Datacenter Networking},
  abstract         = {Leveraging application-level requirements using coflows has recently been shown to improve application-level communication performance in data-parallel clusters. However, existing coflow-based solutions rely on modifying applications to extract coflows, making them inapplicable to many practical scenarios.

In this paper, we present CODA, a first attempt at automatically identifying and scheduling coflows without any application modifications. We employ an incremental clustering algorithm to perform fast, application-transparent coflow identification and complement it by proposing an error-tolerant coflow scheduler to mitigate occasional identification errors. Testbed experiments and large-scale simulations with production workloads show that CODA can identify coflows with over 90% accuracy, and its scheduler is robust to inaccuracies, enabling communication stages to complete 2.4X (5.1X) faster on average (95th percentile) compared to per-flow mechanisms. Overall, CODA's performance is comparable to that of solutions requiring application modifications.}
}

% Hermes: resilient datacenter load balancing (ACM SIGCOMM, August 2017).
@inproceedings{hermes:sigcomm17,
  author           = {Hong Zhang and Junxue Zhang and Wei Bai and Kai Chen and Mosharaf Chowdhury},
  title            = {Resilient Datacenter Load Balancing in the Wild},
  booktitle        = {ACM SIGCOMM},
  year             = {2017},
  month            = {August},
  publist_confkey  = {SIGCOMM'17},
  publist_link     = {paper || hermes-sigcomm17.pdf},
  publist_topic    = {Datacenter Networking},
  abstract         = {Production datacenters operate under various uncertainties such as traffic dynamics, topology asymmetry, and failures. Therefore, datacenter load balancing schemes must be resilient to these uncertainties; i.e., they should accurately sense path conditions and timely react to mitigate the fallouts. Despite significant efforts, prior solutions have important drawbacks. On the one hand, solutions such as Presto and DRB are oblivious to path conditions and blindly reroute at fixed granularity. On the other hand, solutions such as CONGA and CLOVE can sense congestion, but they can only reroute when flowlets emerge; thus, they cannot always react timely to uncertainties. To make things worse, these solutions fail to detect/handle failures such as blackholes and random packet drops, which greatly degrades their performance.

In this paper, we introduce Hermes, a datacenter load balancer that is resilient to the aforementioned uncertainties. At its heart, Hermes leverages comprehensive sensing to detect path conditions including failures unattended before, and it reacts using timely yet cautious rerouting. Hermes is a practical edge-based solution with no switch modification. We have implemented Hermes with commodity switches and evaluated it through both testbed experiments and large-scale simulations. Our results show that Hermes achieves comparable performance to CONGA and Presto in normal cases, and well handles uncertainties: under asymmetries, Hermes achieves up to 10% and 20% better flow completion time (FCT) than CONGA and CLOVE; under switch failures, it outperforms all other schemes by over 32%.}
}

% RDMA performance-isolation study (KBNets workshop at ACM SIGCOMM, August 2017).
% Fix: booktitle was the fused string "ACM SIGCOMMKBNets"; the venue prefix and workshop name are now separated.
@inproceedings{frdma:kbnets17,
  author           = {Yiwen Zhang and Juncheng Gu and Youngmoon Lee and Mosharaf Chowdhury and Kang G. Shin},
  title            = {Performance Isolation Anomalies in {RDMA}},
  booktitle        = {ACM SIGCOMM KBNets},
  year             = {2017},
  month            = {August},
  publist_confkey  = {KBNets'17},
  publist_link     = {paper || frdma-kbnets17.pdf},
  publist_link     = {slides || frdma-kbnets17-slides.pdf},
  publist_topic    = {Datacenter Networking},
  abstract         = {To meet the increasing throughput and latency demands of modern applications, many operators are rapidly deploying RDMA in their datacenters. At the same time, developers are re-designing their software to take advantage of RDMA's benefits for individual applications. However, when it comes to RDMA's performance, many simple questions remain open.

In this paper, we consider the performance isolation characteristics of RDMA. Specifically, we conduct three sets of experiments -- three combinations of one throughput-sensitive flow and one latency-sensitive flow -- in a controlled environment, observe large discrepancies in RDMA performance with and without the presence of a competing flow, and describe our progress in identifying plausible root-causes.}
}

% "Pas de deux": joint circuit and application-traffic shaping (ACM APNet 2018).
% Fix: removed the PDF line-break artifact "solu- tions" from the abstract.
@inproceedings{pdd:apnet18,
  author           = {Hong Zhang and Kai Chen and Mosharaf Chowdhury},
  title            = {Pas de deux: Shape the Circuits, and Shape the Apps too!},
  booktitle        = {ACM APNet},
  year             = {2018},
  pages            = {29--35},
  publist_confkey  = {APNet'18},
  publist_link     = {paper || pdd-apnet18.pdf},
  publist_topic    = {Datacenter Networking},
  abstract         = {Despite continued efforts toward building high bandwidth, low cost datacenter networks with reconfigurable optical fabrics, the impact of optical networks on datacenter applications has received little attention. Given the constraints of optical networks and the semantics of datacenter applications, we believe the network-application intersection to be the next innovation hotspot. In this paper, we specifically focus on data-parallel applications for two primary reasons: they are a natural fit to exploit high bandwidth optical fabrics, and they often form structured communication patterns or coflows.

    We show that configuring circuits in reaction to changing traffic patterns is not enough. Efficient scheduling of even a single coflow in optical networks should be a "Pas de deux" – a joint shaping of not only the underlying circuit, but also the application’s traffic demand. Our preliminary evaluation with a production trace shows that joint shaping is on average within 1.18X of the optimal and performs 30% better than solutions that configure circuits in application-agnostic fashions. We further extend our analysis to inter-coflow scheduling and propose a layered solution that jointly considers circuit reconfiguration, coflow prioritization, as well as flow rate and route assignments.}
}

% Approximate graph analytics workshop paper (ACM SIGMOD GRADES-NDA 2018; carries a Best Paper Award badge).
% Fix: title capitalisation made consistent ("Graph Analytics" instead of "Graph analytics").
@inproceedings{aga:grades-nda18,
  author           = {Anand Padmanabha Iyer and Aurojit Panda and Shivaram Venkataraman and Mosharaf Chowdhury and Aditya Akella and Scott Shenker and Ion Stoica},
  title            = {Bridging the {GAP}: Towards Approximate Graph Analytics},
  booktitle        = {ACM SIGMOD GRADES-NDA},
  year             = {2018},
  publist_confkey  = {GRADES-NDA'18},
  publist_link     = {paper || aga-grades-nda18.pdf},
  publist_badge    = {Best Paper Award},
  publist_topic    = {Big Data Systems},
  abstract         = {While there has been a tremendous interest in processing data that has an underlying graph structure, existing distributed graph processing systems take several minutes or even hours to execute popular graph algorithms. However, in several cases, providing an approximate answer is good enough. Approximate analytics is seeing considerable attention in big data due to its ability to produce timely results by trading accuracy, but they do not support graph analytics. In this paper, we bridge this gap and take a first attempt at realizing approximate graph analytics. We discuss how traditional approximate analytics techniques do not carry over to the graph usecase. Leveraging the characteristics of graph properties and algorithms, we propose a graph sparsification technique, and a machine learning based approach to choose the apt amount of sparsification required to meet a given budget. Our preliminary evaluations show encouraging results.}
}

% Monarch: geo-distributed graph analytics (USENIX HotCloud 2018).
@inproceedings{monarch:hotcloud18,
  author           = {Anand Padmanabha Iyer and Aurojit Panda and Mosharaf Chowdhury and Aditya Akella and Scott Shenker and Ion Stoica},
  title            = {Monarch: Gaining Command on Geo-Distributed Graph Analytics},
  booktitle        = {USENIX HotCloud},
  year             = {2018},
  publist_confkey  = {HotCloud'18},
  publist_link     = {paper || monarch-hotcloud18.pdf},
  publist_topic    = {Wide-Area Computing},
  abstract         = {A number of existing and emerging application scenarios generate graph-structured data in a geo-distributed fashion. Although there is a lot of interest in distributed graph processing systems, none of them support geo-distributed graph processing. Geo-distributed analytics, on the other hand, has not focused on iterative workloads such as distributed graph processing.

    In this paper, we look at the problem of efficient geo-distributed graph analytics. We find that optimizing the iterative processing style of graph-parallel systems is the key to achieving this goal rather than extending existing geo-distributed techniques to graph processing. Based on this, we discuss our proposal on building Monarch, the first system to our knowledge that focuses on geo-distributed graph processing. Our preliminary evaluation of Monarch shows encouraging results.}
}

% Measurement study of relaying for inter-cloud transfers (USENIX HotCloud 2018).
% Fix: restored the hyphen in "WAN-aware" in the abstract (was the fused word "WANaware").
@inproceedings{relay:hotcloud18,
  author           = {Fan Lai and Mosharaf Chowdhury and Harsha V. Madhyastha},
  title            = {To Relay or Not to Relay for Inter-Cloud Transfers?},
  booktitle        = {USENIX HotCloud},
  year             = {2018},
  publist_confkey  = {HotCloud'18},
  publist_link     = {paper || relay-hotcloud18.pdf},
  publist_link     = {slides || relay-hotcloud18-slides.pdf},
  publist_topic    = {Wide-Area Computing},
  abstract         = {
    Efficient big data analytics over the wide-area network (WAN) is becoming increasingly more popular. Current geo-distributed analytics (GDA) systems employ WAN-aware optimizations to tackle WAN heterogeneities. Although extensive measurements on public clouds suggest the potential for improving inter-datacenter data transfers via detours, we show that such optimizations are unlikely to work in practice. This is because the widely accepted mantra used in a large body of literature – WAN bandwidth has high variability – can be misleading. Instead, our measurements across 40 datacenters belonging to Amazon EC2, Microsoft Azure, and Google Cloud Platform show that the available WAN bandwidth is often spatially homogeneous and temporally stable between two virtual machines (VMs) in different datacenters, even though it can be heterogeneous at the TCP flow level. Moreover, there is little scope for either bandwidth or latency optimization in a cost-effective manner via relaying. We believe that these findings will motivate the community to rethink the design rationales of GDA systems and geo-distributed services.
  }
}

% CellScope conference paper (ACM MobiCom 2018, pp. 513--528).
@inproceedings{cellscope:mobicom18,
  author           = {Anand Padmanabha Iyer and Li Erran Li and Mosharaf Chowdhury and Ion Stoica},
  title            = {Mitigating the Latency-Accuracy Trade-off in Mobile Data Analytics Systems},
  booktitle        = {ACM MobiCom},
  year             = {2018},
  pages            = {513--528},
  publist_confkey  = {MobiCom'18},
  publist_link     = {paper || cellscope-mobicom18.pdf},
  publist_topic    = {Wide-Area Computing},
  abstract         = {An increasing amount of mobile analytics is performed on data that is procured in a real-time fashion to make real-time decisions. Such tasks include simple reporting on streams to sophisticated model building. However, the practicality of these analyses are impeded in several domains because they are faced with a fundamental trade-off between data collection latency and analysis accuracy.

    In this paper, we first study this trade-off in the context of a specific domain, Cellular Radio Access Networks (RAN). We find that the trade-off can be resolved using two broad, general techniques: intelligent data grouping and task formulations that leverage domain characteristics. Based on this, we present CellScope, a system that applies a domain specific formulation and application of Multi-task Learning (MTL) to RAN performance analysis. It uses three techniques: feature engineering to transform raw data into effective features, a PCA inspired similarity metric to group data from geographically nearby base stations sharing performance commonalities, and a hybrid online-offline model for efficient model updates. Our evaluation shows that CellScope's accuracy improvements over direct application of ML range from 2.5X to 4.4X while reducing the model update overhead by up to 4.8X. We have also used CellScope to analyze an LTE network of over 2 million subscribers, where it reduced troubleshooting efforts by several magnitudes.}
}

% QOOP: dynamic query re-planning (USENIX OSDI 2018, pp. 253--267).
@inproceedings{qoop:osdi18,
  author           = {Kshiteej Mahajan and Mosharaf Chowdhury and Aditya Akella and Shuchi Chawla},
  title            = {Dynamic Query Re-Planning using {QOOP}},
  booktitle        = {USENIX OSDI},
  year             = {2018},
  pages            = {253--267},
  publist_confkey  = {OSDI'18},
  publist_link     = {paper || qoop-osdi18.pdf},
  publist_topic    = {Big Data Systems},
  abstract         = {Modern data processing clusters are highly dynamic – both in terms of the number of concurrently running jobs and their resource usage. To improve job performance, recent works have focused on optimizing the cluster scheduler and the jobs' query planner with a focus on picking the right query execution plan (QEP) – represented as a directed acyclic graph – for a job in a resource-aware manner, and scheduling jobs in a QEP-aware manner. However, because existing solutions use a fixed QEP throughout the entire execution, the inability to adapt a QEP in reaction to resource changes often leads to large performance inefficiencies.

    This paper argues for dynamic query re-planning, wherein we re-evaluate and re-plan a job's QEP during its execution. We show that designing for re-planning requires fundamental changes to the interfaces between key layers of data analytics stacks today, i.e., the query planner, the execution engine, and the cluster scheduler. Instead of pushing more complexity into the scheduler or the query planner, we argue for a redistribution of responsibilities between the three components to simplify their designs. Under this redesign, we analytically show that a greedy algorithm for re-planning and execution alongside a simple max-min fair scheduler can offer provably competitive behavior even under adversarial resource changes. We prototype our algorithms atop Apache Hive and Tez. Via extensive experiments, we show that our design can offer a median performance improvement of 1.47X compared to state-of-the-art alternatives.}
}

@InProceedings{dslr:sigmod18,
  author    = {Dong Young Yoon and Mosharaf Chowdhury and Barzan Mozafari},
  booktitle = {ACM SIGMOD},
  title     = {Distributed Lock Management with {RDMA}: Decentralization without Starvation},
  year      = {2018},
  pages     = {1571--1586},

  publist_confkey = {SIGMOD'18},
  publist_link = {paper || dslr-sigmod18.pdf},
  publist_topic = {Datacenter Networking},
  publist_topic = {Disaggregation},
  publist_abstract = {Lock managers are a crucial component of modern distributed systems. However, with the increasing availability of fast RDMA-enabled networks, traditional lock managers can no longer keep up with the latency and throughput requirements of modern systems. Centralized lock managers can ensure fairness and prevent starvation using global knowledge of the system, but are themselves single points of contention and failure. Consequently, they fall short in leveraging the full potential of RDMA networks. On the other hand, decentralized (RDMA-based) lock managers either completely sacrifice global knowledge to achieve higher throughput at the risk of starvation and higher tail latencies, or they resort to costly communications in order to maintain global knowledge, which can result in significantly lower throughput.

    In this paper, we show that it is possible for a lock manager to be fully decentralized and yet exchange the partial knowledge necessary for preventing starvation and thereby reducing tail latencies. Our main observation is that we can design a lock manager primarily using RDMA's fetch-and-add (FA) operations, which always succeed, rather than compare-and-swap (CAS) operations, which only succeed if a given condition is satisfied. While this requires us to rethink the locking mechanism from the ground up, it enables us to sidestep the performance drawbacks of the previous CAS-based proposals that relied solely on blind retries upon lock conflicts.

    Specifically, we present DSLR (Decentralized and Starvation-free Lock management with RDMA), a decentralized lock manager that targets distributed systems running on RDMA-enabled networks. We demonstrate that, despite being fully decentralized, DSLR prevents starvation and blind retries by guaranteeing first-come-first-serve (FCFS) scheduling without maintaining explicit queues. We adapt Lamport's bakery algorithm to an RDMA-enabled environment with multiple bakers, utilizing only one-sided READ and atomic FA operations. Our experiments show that, on average, DSLR delivers 1.8X (and up to 2.8X) higher throughput than all existing RDMA-based lock managers, while reducing their mean and 99.9% latencies by 2.0X and 18.3X (and up to 2.5X and 47X), respectively.}
}

@InProceedings{allox:mama18,
  author    = {Xiao Sun and Tan N. Le and Mosharaf Chowdhury and Zhenhua Liu},
  booktitle = {ACM SIGMETRICS MAMA},
  title     = {Fair Allocation of Heterogeneous and Interchangeable Resources},
  year      = {2018},

  publist_confkey = {MAMA'18},
  publist_link = {paper || allox-mama18.pdf},
  publist_topic = {Systems + AI},
  publist_abstract = {Motivated by the proliferation of heterogeneous processors such as multi-core CPUs, GPUs, TPUs, and other accelerators for machine learning, we formulate a novel multi-interchangeable resource allocation (MIRA) problem where some resources are interchangeable. The challenge is how to allocate interchangeable resources to users in a sharing system while maintaining desirable properties such as sharing incentive, Pareto efficiency, and envy-freeness. In this paper, we first show that existing algorithms, including the Dominant Resource Fairness used in production systems, fail to provide these properties for interchangeable resources. Then we characterize the tradeoff between performance and strategyproofness, and design the Budget-based (BUD) algorithm, which preserves Pareto efficiency, sharing incentive and envy-freeness while providing better performance over currently used algorithms.
  }
}

@InProceedings{tiresias:nsdi19,
  title            = {Tiresias: A {GPU} Cluster Manager for Distributed Deep Learning},
  author           = {Juncheng Gu and Mosharaf Chowdhury and Kang G. Shin and Yibo Zhu and Myeongjae Jeon and Junjie Qian and Hongqiang Harry Liu and Chuanxiong Guo},
  booktitle        = {USENIX NSDI},
  year             = {2019},
  pages            = {485--500},
  publist_confkey  = {NSDI'19},
  publist_link     = {paper || tiresias-nsdi19.pdf},
  publist_link     = {slides || tiresias-nsdi19-slides.pdf},
  publist_link     = {pps || tiresias-nsdi19-slides.ppsx},
  publist_link     = {code || https://github.com/SymbioticLab/Tiresias},
  publist_topic    = {Systems + AI},
  publist_abstract = {
  Deep learning (DL) training jobs bring some unique challenges to existing cluster managers, such as unpredictable training times, an all-or-nothing execution model, and inflexibility in GPU sharing. Our analysis of a large GPU cluster in production shows that existing big data schedulers cause long queueing delays and low overall performance.

  We present Tiresias, a GPU cluster manager tailored for distributed DL training jobs, which efficiently schedules and places DL jobs to reduce their job completion times (JCTs). Given that a DL job’s execution time is often unpredictable, we propose two scheduling algorithms – Discretized Two-Dimensional Gittins index relies on partial information and Discretized Two-Dimensional LAS is information-agnostic – that aim to minimize the average JCT. Additionally, we describe when the consolidated placement constraint can be relaxed, and present a placement algorithm to leverage these observations without any user input. Experiments on the Michigan ConFlux cluster with 60 P100 GPUs and large-scale trace-driven simulations show that Tiresias improves the average JCT by up to 5.5× over an Apache YARN-based resource manager used in production. More importantly, Tiresias’s performance is comparable to that of solutions assuming perfect knowledge.}
}

@InProceedings{nocs:spaa19,
  title            = {Near Optimal Coflow Scheduling in Networks},
  author           = {Mosharaf Chowdhury and Samir Khuller and Manish Purohit and Sheng Yang and Jie You},
  booktitle        = {ACM SPAA},
  year             = {2019},
  pages            = {123--134},
  publist_confkey  = {SPAA'19},
  publist_link     = {paper || nocs-spaa19.pdf},
  publist_topic    = {Wide-Area Computing},
  publist_abstract = {The coflow scheduling problem has emerged as a popular abstraction in the last few years to study data communication problems within a data center. In this basic framework, each coflow has a set of communication demands and the goal is to schedule many coflows in a manner that minimizes the total weighted completion time. A coflow is said to complete when all its communication needs are met. This problem has been extremely well studied for the case of complete bipartite graphs that model a data center with full bisection bandwidth and several approximation algorithms and effective heuristics have been proposed recently. In this work, we study a slightly different model of coflow scheduling in general graphs (to capture traffic between data centers) and develop practical and efficient approximation algorithms for it. Our main result is a randomized 2 approximation algorithm for the single path and free path model, significantly improving prior work. In addition, we demonstrate via extensive experiments that the algorithm is practical, easy to implement and performs well in practice.
  }
}

@Article{memtrade:arxiv21,
  title            = {Memtrade: A Disaggregated-Memory Marketplace for Public Clouds},
  author           = {Hasan Al Maruf and Yuhong Zhong and Hongyi Wang and Mosharaf Chowdhury and Asaf Cidon and Carl Waldspurger},
  journal          = {CoRR},
  volume           = {abs/2108.06893},
  year             = {2021},
  month            = {Aug},
  archiveprefix    = {arXiv},
  eprint           = {2108.06893},
  url              = {https://arxiv.org/abs/2108.06893},
  publist_confkey  = {arXiv:2108.06893},
  publist_link     = {paper || https://arxiv.org/abs/2108.06893},
  publist_topic    = {Disaggregation},
  publist_abstract = {
    We present Memtrade, the first memory disaggregation system for public clouds. Public clouds introduce a set of unique challenges for resource disaggregation across different tenants, including security, isolation and pricing. Memtrade allows producer virtual machines (VMs) to lease both their unallocated memory and allocated-but-idle application memory to remote consumer VMs for a limited period of time. Memtrade does not require any modifications to host-level system software or support from the cloud provider. It harvests producer memory using an application-aware control loop to form a distributed transient remote memory pool with minimal performance impact; it employs a broker to match producers with consumers while satisfying performance constraints; and it exposes the matched memory to consumers as a secure KV cache. Our evaluation using real-world cluster traces shows that Memtrade provides significant performance benefit for consumers (improving average read latency up to 2.8x) while preserving confidentiality and integrity, with little impact on producer applications (degrading performance by less than 2.1%).
  }
}

@Article{tpp:arxiv22,
  title            = {{TPP}: Transparent Page Placement for {CXL}-Enabled Tiered Memory},
  author           = {Hasan Al Maruf and Hao Wang and Abhishek Dhanotia and Johannes Weiner and Niket Agarwal and Pallab Bhattacharya and Chris Petersen and Mosharaf Chowdhury and Shobhit Kanaujia and Prakash Chauhan},
  journal          = {CoRR},
  volume           = {abs/2206.02878},
  year             = {2022},
  month            = {Jun},
  archiveprefix    = {arXiv},
  eprint           = {2206.02878},
  url              = {https://arxiv.org/abs/2206.02878},
  publist_confkey  = {arXiv:2206.02878},
  publist_link     = {paper || https://arxiv.org/abs/2206.02878},
  publist_topic    = {Disaggregation},
  publist_abstract = {
    With increasing memory demands for datacenter applications and the emergence of coherent interfaces like CXL that enable main memory expansion, we are about to observe a wide adoption of tiered-memory subsystems in hyperscalers. In such systems, main memory can constitute different memory technologies with varied performance characteristics. In this paper, we characterize the memory usage of a wide range of datacenter applications across the server fleet of a hyperscaler (Meta) to get insights into an application's memory access patterns and performance on a tiered memory system. Our characterizations show that datacenter applications can benefit from tiered memory systems as there exist opportunities for offloading colder pages to slower memory tiers. Without efficient memory management, however, such systems can significantly degrade performance.
    We propose a novel OS-level application-transparent page placement mechanism (TPP) for efficient memory management. TPP employs a lightweight mechanism to identify and place hot and cold pages to appropriate memory tiers. It enables page allocation to work independently from page reclamation logic that is, otherwise, tightly coupled in today's Linux kernel. As a result, the local memory tier has memory headroom for new allocations. At the same time, TPP can promptly promote performance-critical hot pages trapped in the slow memory tiers to the fast tier node. Both promotion and demotion mechanisms work transparently without any prior knowledge of an application's memory access behavior. We evaluate TPP with diverse workloads that consume significant portions of DRAM on Meta's server fleet and are sensitive to memory subsystem performance. TPP's efficient page placement improves Linux's performance by up to 18%. TPP outperforms NUMA balancing and AutoTiering, state-of-the-art solutions for tiered memory, by 10-17%.
  }
}

@Article{egeria:arxiv22,
  title            = {Efficient {DNN} Training with Knowledge-Guided Layer Freezing},
  author           = {Yiding Wang and Decang Sun and Kai Chen and Fan Lai and Mosharaf Chowdhury},
  journal          = {CoRR},
  volume           = {abs/2201.06227},
  year             = {2022},
  month            = {Jan},
  archiveprefix    = {arXiv},
  eprint           = {2201.06227},
  url              = {https://arxiv.org/abs/2201.06227},
  publist_confkey  = {arXiv:2201.06227},
  publist_link     = {paper || https://arxiv.org/abs/2201.06227},
  publist_topic    = {Systems + AI},
  publist_abstract = {
    Training deep neural networks (DNNs) is time-consuming. While most existing solutions try to overlap/schedule computation and communication for efficient training, this paper goes one step further by skipping computing and communication through DNN layer freezing. Our key insight is that the training progress of internal DNN layers differs significantly, and front layers often become well-trained much earlier than deep layers. To explore this, we first introduce the notion of training plasticity to quantify the training progress of internal DNN layers. Then we design KGT, a knowledge-guided DNN training system that employs semantic knowledge from a reference model to accurately evaluate individual layers' training plasticity and safely freeze the converged ones, saving their corresponding backward computation and communication. Our reference model is generated on the fly using quantization techniques and runs forward operations asynchronously on available CPUs to minimize the overhead. In addition, KGT caches the intermediate outputs of the frozen layers with prefetching to further skip the forward computation. Our implementation and testbed experiments with popular vision and language models show that KGT achieves 19%-43% training speedup w.r.t. the state-of-the-art without sacrificing accuracy.
  }
}

@Article{treehouse-whitepaper:arxiv22,
  title            = {Treehouse: A Case For Carbon-Aware Datacenter Software},
  author           = {Thomas Anderson and Adam Belay and Mosharaf Chowdhury and Asaf Cidon and Irene Zhang},
  journal          = {CoRR},
  volume           = {abs/2201.02120},
  year             = {2022},
  month            = {Jan},
  archiveprefix    = {arXiv},
  eprint           = {2201.02120},
  url              = {https://arxiv.org/abs/2201.02120},
  publist_confkey  = {arXiv:2201.02120},
  publist_link     = {paper || https://arxiv.org/abs/2201.02120},
  publist_topic    = {Energy-Efficient Systems},
  publist_topic    = {Disaggregation},
  publist_abstract = {
    The end of Dennard scaling and the slowing of Moore's Law has put the energy use of datacenters on an unsustainable path. Datacenters are already a significant fraction of worldwide electricity use, with application demand scaling at a rapid rate. We argue that substantial reductions in the carbon intensity of datacenter computing are possible with a software-centric approach: by making energy and carbon visible to application developers on a fine-grained basis, by modifying system APIs to make it possible to make informed trade offs between performance and carbon emissions, and by raising the level of application programming to allow for flexible use of more energy efficient means of compute and storage. We also lay out a research agenda for systems software to reduce the carbon footprint of datacenter computing.
  }
}

@Article{autops:arxiv22,
  title            = {Elastic Model Aggregation with Parameter Service},
  author           = {Juncheng Gu and Mosharaf Chowdhury and Kang G. Shin and Aditya Akella},
  journal          = {CoRR},
  volume           = {abs/2204.03211},
  year             = {2022},
  month            = {Apr},
  archiveprefix    = {arXiv},
  eprint           = {2204.03211},
  url              = {https://arxiv.org/abs/2204.03211},
  publist_confkey  = {arXiv:2204.03211},
  publist_link     = {paper || https://arxiv.org/abs/2204.03211},
  publist_topic    = {Systems + AI},
  publist_abstract = {
    Model aggregation, the process that updates model parameters, is an important step for model convergence in distributed deep learning (DDL). However, the parameter server (PS), a popular paradigm of performing model aggregation, causes CPU underutilization in deep learning (DL) clusters, due to the bursty nature of aggregation and static resource allocation. To remedy this problem, we propose Parameter Service, an elastic model aggregation framework for DDL training, which decouples the function of model aggregation from individual training jobs and provides a shared model aggregation service to all jobs in the cluster. In Parameter Service, model aggregations are efficiently packed and dynamically migrated to fit into the available CPUs with negligible time overhead. Furthermore, Parameter Service can elastically manage its CPU resources based on its load to enhance resource efficiency. We have implemented Parameter Service in a prototype system called AutoPS and evaluated it via testbed experimentation and trace-driven simulations. AutoPS reduces up to 75% of CPU consumption with little or no performance impact on the training jobs. The design of Parameter Service is transparent to the users and can be incorporated in popular DL frameworks.
  }
}

@Article{oort:arxiv20,
  author        = {Fan Lai and Xiangfeng Zhu and Harsha V. Madhyastha and Mosharaf Chowdhury},
  journal       = {CoRR},
  title         = {Oort: Informed Participant Selection for Scalable Federated Learning},
  year          = {2020},
  month         = {Oct},
  volume        = {abs/2010.06081},
  archiveprefix = {arXiv},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/journals/corr/abs-2010-06081.bib},
  eprint        = {2010.06081},
  url           = {https://arxiv.org/abs/2010.06081},

  publist_confkey = {arXiv:2010.06081},
  publist_link = {paper || https://arxiv.org/abs/2010.06081},
  publist_topic = {Wide-Area Computing},
  publist_topic = {Systems + AI},
  publist_abstract = {
    Federated Learning (FL) is an emerging direction in distributed machine learning (ML) that enables in-situ model training and testing on edge data. Despite having the same end goals as traditional ML, FL executions differ significantly in scale, spanning thousands to millions of participating devices. As a result, data characteristics and device capabilities vary widely across clients. Yet, existing efforts randomly select FL participants, which leads to poor model and system efficiency.

    In this paper, we propose Oort to improve the performance of federated training and testing with guided participant selection. With an aim to improve time-to-accuracy performance in model training, Oort prioritizes the use of those clients who have both data that offers the greatest utility in improving model accuracy and the capability to run training quickly. To enable FL developers to interpret their results in model testing, Oort enforces their requirements on the distribution of participant data while improving the duration of federated testing by cherry-picking clients. Our evaluation shows that, compared to existing participant selection mechanisms, Oort improves time-to-accuracy performance by 1.2x-14.1x and final model accuracy by 1.3%-9.8%, while efficiently enforcing developer-specified model testing criteria at the scale of millions of clients.
  }
}

@Article{salus:arxiv19,
  title            = {Salus: Fine-Grained {GPU} Sharing Primitives for Deep Learning Applications},
  author           = {Peifeng Yu and Mosharaf Chowdhury},
  journal          = {CoRR},
  volume           = {abs/1902.04610},
  year             = {2019},
  month            = {Feb},
  archiveprefix    = {arXiv},
  eprint           = {1902.04610},
  url              = {https://arxiv.org/abs/1902.04610},
  bibsource        = {dblp computer science bibliography, https://dblp.org},
  biburl           = {https://dblp.org/rec/journals/corr/abs-1902-04610.bib},
  publist_confkey  = {arXiv:1902.04610},
  publist_link     = {paper || https://arxiv.org/abs/1902.04610},
  publist_topic    = {Systems + AI},
  publist_abstract = {
    GPU computing is becoming increasingly more popular with the proliferation of deep learning (DL) applications.
    However, unlike traditional resources such as CPU or the network, modern GPUs do not natively support fine-grained sharing primitives.
    Consequently, implementing common policies such as time sharing and preemption are expensive.
    Worse, when a DL application cannot completely use a GPU's resources, the GPU cannot be efficiently shared between multiple applications, leading to GPU underutilization.

    We present Salus to enable two GPU sharing primitives: fast job switching and memory sharing, in order to achieve fine-grained GPU sharing among multiple DL applications.
    Salus implements an efficient, consolidated execution service that exposes the GPU to different DL applications, and enforces fine-grained sharing by performing iteration scheduling and addressing associated memory management issues.
    We show that these primitives can then be used to implement flexible sharing policies such as fairness, prioritization, and packing for various use cases.
    Our integration of Salus with TensorFlow and evaluation on popular DL jobs show that Salus can improve the average completion time of DL training jobs by $3.19\times$,
    GPU utilization for hyper-parameter tuning by $2.38\times$,
    and GPU utilization of DL inference applications by $42\times$ over not sharing the GPU and $7\times$ over NVIDIA MPS with small overhead.
  }
}

@Article{terra:arxiv19,
  author        = {Jie You and Mosharaf Chowdhury},
  journal       = {CoRR},
  title         = {Terra: Scalable Cross-Layer {GDA} Optimizations},
  year          = {2019},
  month         = {Apr},
  volume        = {abs/1904.08480},
  archiveprefix = {arXiv},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1904-08480.bib},
  eprint        = {1904.08480},
  url           = {https://arxiv.org/abs/1904.08480},
  publist_confkey = {arXiv:1904.08480},
  publist_link = {paper || https://arxiv.org/abs/1904.08480},
  publist_topic = {Wide-Area Computing},
  publist_abstract  = {Geo-distributed analytics (GDA) frameworks transfer large datasets over the wide-area network (WAN). Yet existing frameworks often ignore the WAN topology. This disconnect between WAN-bound applications and the WAN itself results in missed opportunities for cross-layer optimizations. In this paper, we present Terra to bridge this gap. Instead of decoupled WAN routing and GDA transfer scheduling, Terra applies scalable cross-layer optimizations to minimize WAN transfer times for GDA jobs. We present a two-pronged approach: (i) a scalable algorithm for joint routing and scheduling to make fast decisions; and (ii) a scalable, overlay-based enforcement mechanism that avoids expensive switch rule updates in the WAN. Together, they enable Terra to quickly react to WAN uncertainties such as large bandwidth fluctuations and failures in an application-aware manner as well. Integration with the FloodLight SDN controller and Apache YARN, and evaluation on 4 workloads and 3 WAN topologies show that Terra improves the average completion times of GDA jobs by 1.55x-3.43x. GDA jobs running with Terra meet 2.82x-4.29x more deadlines and can quickly react to WAN-level events in an application-aware manner.
  }
}

@Article{justitia:arxiv19,
  author        = {Yiwen Zhang and Yue Tan and Brent Stephens and Mosharaf Chowdhury},
  journal       = {CoRR},
  title         = {{RDMA} Performance Isolation With {Justitia}},
  year          = {2019},
  month         = {May},
  volume        = {abs/1905.04437},
  archiveprefix = {arXiv},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1905-04437.bib},
  eprint        = {1905.04437},
  url           = {https://arxiv.org/abs/1905.04437},

  publist_confkey = {arXiv:1905.04437},
  publist_link = {paper || https://arxiv.org/abs/1905.04437},
  publist_topic = {Datacenter Networking},
  publist_topic = {Disaggregation},
  publist_abstract = {
    Despite its increasing popularity, most of RDMA’s benefits such as ultra-low latency can be achieved only when running an application in isolation. Using microbenchmarks and real open-source RDMA applications, we identify a series of performance anomalies when multiple applications coexist and show that such anomalies are pervasive across InfiniBand, RoCEv2, and iWARP. They arise due to a fundamental tradeoff between performance isolation and work conservation, which the state-of-the-art RDMA congestion control protocols such as DCQCN cannot resolve.

    We present Justitia to address these performance anomalies. Justitia is a software-only, host-based, and easy-to-deploy solution that maximizes RNIC utilization while guaranteeing performance isolation via shaping, rate limiting, and pacing at senders. Our evaluation of Justitia on multiple RDMA implementations shows that Justitia effectively isolates different types of traffic and significantly improves latency (by up to 56.9×) and throughput (by up to 9.7×) of real-world RDMA-based applications without compromising low CPU usage or modifying the applications.
  }
}

@Article{nocs:arxiv19,
  title            = {Near Optimal Coflow Scheduling in Networks},
  author           = {Mosharaf Chowdhury and Samir Khuller and Manish Purohit and Sheng Yang and Jie You},
  journal          = {CoRR},
  volume           = {abs/1906.06851},
  year             = {2019},
  month            = {Jun},
  archiveprefix    = {arXiv},
  eprint           = {1906.06851},
  url              = {https://arxiv.org/abs/1906.06851},
  bibsource        = {dblp computer science bibliography, https://dblp.org},
  biburl           = {https://dblp.org/rec/journals/corr/abs-1906-06851.bib},
  publist_confkey  = {arXiv:1906.06851},
  publist_link     = {paper || https://arxiv.org/abs/1906.06851},
  publist_topic    = {Wide-Area Computing},
  publist_abstract = { The coflow scheduling problem has emerged as a popular abstraction in the last few years to study data communication problems within a data center. In this basic framework, each coflow has a set of communication demands and the goal is to schedule many coflows in a manner that minimizes the total weighted completion time. A coflow is said to complete when all its communication needs are met. This problem has been extremely well studied for the case of complete bipartite graphs that model a data center with full bisection bandwidth and several approximation algorithms and effective heuristics have been proposed recently.
In this work, we study a slightly different model of coflow scheduling in general graphs (to capture traffic between data centers) and develop practical and efficient approximation algorithms for it. Our main result is a randomized 2 approximation algorithm for the single path and free path model, significantly improving prior work. In addition, we demonstrate via extensive experiments that the algorithm is practical, easy to implement and performs well in practice.
}
}

@Article{hydra:arxiv19,
  author        = {Youngmoon Lee and Hasan Al Maruf and Mosharaf Chowdhury and Asaf Cidon and Kang G. Shin},
  journal       = {CoRR},
  title         = {Mitigating the Performance-Efficiency Tradeoff in Resilient Memory Disaggregation},
  year          = {2019},
  month         = {Oct},
  volume        = {abs/1910.09727},
  archiveprefix = {arXiv},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1910-09727.bib},
  eprint        = {1910.09727},
  url           = {https://arxiv.org/abs/1910.09727},

  publist_confkey = {arXiv:1910.09727},
  publist_link = {paper || https://arxiv.org/abs/1910.09727},
  publist_topic = {Disaggregation},
  publist_abstract = {
    Memory disaggregation has received attention in recent years as a promising idea to reduce the total cost of ownership (TCO) of memory in modern datacenters. However, relying on remote memory expands an application’s failure domain and makes it susceptible to tail latency variations. In attempts to making disaggregated memory resilient, state-of-the-art solutions face the classic tradeoff between performance and efficiency: some double the memory overhead of disaggregation by replicating to remote memory, while many others limit performance by replicating to the local disk.

    We present Hydra, a configurable, erasure-coded resilience mechanism for common memory disaggregation solutions. It can transparently handle uncertainties arising from remote failures, evictions, memory corruptions, and stragglers from network imbalance with a significantly better performance-efficiency tradeoff than the state-of-the-art. We design a fine-tuned data path to achieve single µs read/write latency to remote memory, develop decentralized algorithms for cluster-wide memory management, and analyze how to select parameters to mitigate independent and correlated uncertainties. Our integration of Hydra with two major memory disaggregation systems and evaluation on a 50-machine RDMA cluster demonstrates that it achieves the best of both worlds: it improves the latency and throughput of memory-intensive applications by up to 64.78× and 20.61×, respectively, over the state-of-the-art disk backup-based solution. At the same time, it provides performance similar to that of in-memory replication with 1.6× lower memory overhead.
  }
}

@Article{bopf:arxiv19,
  author        = {Tan N. Le and Xiao Sun and Mosharaf Chowdhury and Zhenhua Liu},
  journal       = {CoRR},
  title         = {{BoPF}: Mitigating the Burstiness-Fairness Tradeoff in Multi-Resource Clusters},
  year          = {2019},
  month         = {Dec},
  volume        = {abs/1912.03523},
  archiveprefix = {arXiv},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1912-03523.bib},
  eprint        = {1912.03523},
  url           = {https://arxiv.org/abs/1912.03523},

  publist_confkey = {arXiv:1912.03523},
  publist_link = {paper || https://arxiv.org/abs/1912.03523},
  publist_topic = {Big Data Systems},
  publist_abstract = {
    Simultaneously supporting latency- and throughput-sensitive workloads in a shared environment is an increasingly more common challenge in big data clusters. Despite many advances, existing cluster schedulers force the same performance goal - fairness in most cases - on all jobs. Latency-sensitive jobs suffer, while throughput-sensitive ones thrive. Using prioritization does the opposite: it opens up a path for latency-sensitive jobs to dominate. In this paper, we tackle the challenges in supporting both short-term performance and long-term fairness simultaneously with high resource utilization by proposing Bounded Priority Fairness (BoPF). BoPF provides short-term resource guarantees to latency-sensitive jobs and maintains long-term fairness for throughput-sensitive jobs. BoPF is the first scheduler that can provide long-term fairness, burst guarantee, and Pareto efficiency in a strategyproof manner for multi-resource scheduling. Deployments and large-scale simulations show that BoPF closely approximates the performance of Strict Priority as well as the fairness characteristics of DRF. In deployments, BoPF speeds up latency-sensitive jobs by 5.38 times compared to DRF, while still maintaining long-term fairness. In the meantime, BoPF improves the average completion times of throughput-sensitive jobs by up to 3.05 times compared to Strict Priority.
  }
}

@InProceedings{allox:eurosys20,
  title            = {{AlloX}: Compute Allocation in Hybrid Clusters},
  author           = {Tan N. Le and Xiao Sun and Mosharaf Chowdhury and Zhenhua Liu},
  booktitle        = {ACM EuroSys},
  year             = {2020},
  pages            = {31:1--31:16},
  publist_confkey  = {EuroSys'20},
  publist_link     = {paper || allox-eurosys20.pdf},
  publist_link     = {code || https://github.com/lenhattan86/allox},
  publist_topic    = {Systems + AI},
  publist_abstract = {
    Modern deep learning frameworks support a variety of hardware, including CPU, GPU, and other accelerators, to perform computation. In this paper, we study how to schedule jobs over such interchangeable resources – each with a different rate of computation – to optimize performance while providing fairness among users in a shared cluster. We demonstrate theoretically and empirically that existing solutions and their straightforward modifications perform poorly in the presence of interchangeable resources, which motivates the design and implementation of AlloX. At its core, AlloX transforms the scheduling problem into a min-cost bipartite matching problem and provides dynamic fair allocation over time. We theoretically prove its optimality in an ideal, offline setting and show empirically that it works well in the online scenario by incorporating with Kubernetes. Evaluations on a small-scale CPU-GPU hybrid cluster and large-scale simulations highlight that AlloX can reduce the average job completion time significantly (by up to 95% when the system load is high) while providing fairness and preventing starvation.  }
}

@InProceedings{salus:mlsys20,
  author           = {Peifeng Yu and Mosharaf Chowdhury},
  title            = {Salus: Fine-Grained {GPU} Sharing Primitives for Deep Learning Applications},
  booktitle        = {MLSys},
  year             = {2020},

  publist_confkey  = {MLSys'20},
  publist_link     = {paper || salus-mlsys20.pdf},
  publist_link     = {slides || salus-mlsys20-talk.pptm},
  publist_link     = {poster || salus-mlsys20-poster.pdf},
  publist_badge    = {Artifacts Available},
  publist_badge    = {Artifacts Functional},
  publist_badge    = {Results Reproduced},
  publist_topic    = {Systems + AI},
  publist_abstract = {
    Unlike traditional resources such as CPU or the network, modern GPUs do not natively support
    fine-grained sharing primitives.
    Consequently, implementing common policies such as time sharing and preemption are expensive. Worse,
    when a deep learning (DL) application cannot completely use a GPU's resources, the GPU cannot be efficiently shared
    between multiple applications, leading to GPU underutilization.

    We present Salus to enable two GPU sharing primitives: __fast job
    switching__ and __memory sharing__, to achieve fine-grained GPU sharing
    among multiple DL applications. Salus is an efficient, consolidated
    execution service that exposes the GPU to different DL applications, and it
    enforces fine-grained sharing by performing iteration scheduling and
    addressing associated memory management issues. We show that these primitives
    can then be used to implement flexible sharing policies. Our integration of
    Salus with TensorFlow and evaluation on popular DL jobs shows that Salus
    can improve the average completion time of DL training jobs by $3.19\times$, GPU utilization for
    hyper-parameter tuning by $2.38\times$, and GPU utilization of DL inference applications by $42\times$ over not sharing
    the GPU and $6\times$ over NVIDIA MPS with small overhead.
  }
}

@InProceedings{sol:nsdi20,
  author           = {Fan Lai and Jie You and Xiangfeng Zhu and Harsha V. Madhyastha and Mosharaf Chowdhury},
  title            = {Sol: Fast Distributed Computation Over Slow Networks},
  booktitle        = {USENIX NSDI},
  year             = {2020},
  pages            = {273--288},

  publist_confkey  = {NSDI'20},
  publist_link     = {paper || sol-nsdi20.pdf},
  publist_link     = {slides || sol-nsdi20-slides.pdf},
  publist_link     = {code || https://github.com/SymbioticLab/Sol},
  publist_topic    = {Wide-Area Computing},
  publist_abstract = {
    The popularity of big data and AI has led to many optimizations at different layers of distributed computation stacks. Despite – or perhaps, because of – its role as the narrow waist of such software stacks, the design of the execution engine, which is in charge of executing every single task of a job, has mostly remained unchanged. As a result, the execution engines available today are ones primarily designed for low latency and high bandwidth datacenter networks. When either or both of the network assumptions do not hold, CPUs are significantly underutilized.

    In this paper, we take a first-principles approach toward developing an execution engine that can adapt to diverse network conditions. Sol, our federated execution engine architecture, flips the status quo in two respects. First, to mitigate the impact of high latency, Sol proactively assigns tasks, but does so judiciously to be resilient to uncertainties. Second, to improve the overall resource utilization, Sol decouples communication from computation internally instead of committing resources to both aspects of a task simultaneously. Our evaluations on EC2 show that, compared to Apache Spark in resource-constrained networks, Sol improves SQL and machine learning jobs by $16.4\times$ and $4.2\times$ on average.
  }
}

@InProceedings{pando:nsdi20,
  author    = {Muhammed Uluyol and Anthony Huang and Ayush Goel and Mosharaf Chowdhury and Harsha V. Madhyastha},
  booktitle = {USENIX NSDI},
  title     = {Near-Optimal Latency Versus Cost Tradeoffs in Geo-Distributed Storage},
  year      = {2020},
  pages     = {157--180},

  publist_confkey = {NSDI'20},
  publist_link = {paper || pando-nsdi20.pdf},
  publist_topic = {Wide-Area Computing},
  publist_abstract = {
    By replicating data across sites in multiple geographic regions, web services can maximize availability and minimize latency for their users. However, when sacrificing data consistency is not an option, we show that service providers have to today incur significantly higher cost to meet desired latency goals than the lowest cost theoretically feasible. We show that the key to addressing this sub-optimality is to 1) allow for erasure coding, not just replication, of data across data centers, and 2) mitigate the resultant increase in read and write latencies by rethinking how to enable consensus across the wide-area network. Our extensive evaluation mimicking web service deployments on the Azure cloud service shows that we enable near-optimal latency versus cost tradeoffs.
  }
}

@InProceedings{justitia:nsdi22,
  author           = {Yiwen Zhang and Yue Tan and Brent Stephens and Mosharaf Chowdhury},
  title            = {{Justitia}: Software Multi-Tenancy in Hardware Kernel-Bypass Networks},
  booktitle        = {USENIX NSDI},
  year             = {2022},

  publist_confkey  = {NSDI'22},
  publist_link     = {paper || justitia-nsdi22.pdf},
  publist_link     = {code || https://github.com/SymbioticLab/Justitia},
  publist_topic    = {Datacenter Networking},
  publist_topic    = {Disaggregation},
  publist_abstract = {
    Kernel-bypass networking (KBN) is becoming the new norm in modern datacenters.
    While hardware-based KBN offloads all dataplane tasks to specialized NICs to achieve better latency and CPU efficiency than software-based KBN, it also takes away the operator's control over network sharing policies.

    Providing policy support in multi-tenant hardware KBN brings unique challenges -- namely,
    preserving ultra-low latency and low CPU cost, finding a well-defined point of mediation, and rethinking traffic shapers.
    We present Justitia to address these challenges with three key design aspects:
    (i) Split Connection with message-level shaping,
    (ii) sender-based resource mediation together with receiver-side updates, and
    (iii) passive latency monitoring.
    Using a latency target as its knob, Justitia enables multi-tenancy policies such as predictable latencies and fair/weighted resource sharing.
    Our evaluation shows Justitia can effectively isolate latency-sensitive applications at the cost of slightly decreased utilization and ensure that throughput and bandwidth of the rest are not unfairly penalized.
  }
}

@InProceedings{aequitas:sigcomm22,
  author           = {Yiwen Zhang and Gautam Kumar and Nandita Dukkipati and Xian Wu and Priyaranjan Jha and Mosharaf Chowdhury and Amin Vahdat},
  title            = {Aequitas: Admission Control for Performance-Critical {RPCs} in Datacenters},
  booktitle        = {ACM SIGCOMM},
  year             = {2022},

  publist_confkey  = {SIGCOMM'22},
  publist_link     = {paper || aequitas-sigcomm22.pdf},
  publist_link     = {code || https://github.com/SymbioticLab/Aequitas},
  publist_topic    = {Datacenter Networking},
  publist_topic    = {Disaggregation},
  publist_badge    = {Artifacts Available},
  publist_badge    = {Artifacts Functional},
  publist_badge    = {Results Reproduced},
  publist_abstract = {
    With the increasing popularity of disaggregated storage and microservice architectures, high fan-out and fan-in Remote Procedure Calls (RPCs) now generate most of the traffic in modern datacenters.
    While the network plays a crucial role in RPC performance, traditional traffic classification categories cannot sufficiently capture their importance due to wide variations in RPC characteristics.
    As a result, meeting service-level objectives (SLOs), especially for performance-critical ($PC$) RPCs, remains challenging.

    We present Aequitas, a distributed sender-driven admission control scheme that uses commodity Weighted-Fair Queuing (WFQ) to guarantee RPC-level SLOs.
    In the presence of network overloads, it enforces cluster-wide RPC latency SLOs by limiting the amount of traffic admitted into any given QoS and downgrading the rest.
    We show analytically and empirically that this simple scheme works well.
    When the network demand spikes beyond provisioned capacity, Aequitas achieves a latency SLO that is 3.8$\times$ lower than the state-of-art congestion control at the 99.9$^{th}$-$p$ and admits up to 2$\times$ more $PC$ RPCs meeting SLO when compared with pFabric, Qjump, D$^3$, PDQ, and Homa.
    Results in our fleetwide production deployment show a 10\% latency improvement.
  }
}

@InProceedings{vulcan:nsdi24,
  author    = {Yiwen Zhang and Xumiao Zhang and Ganesh Ananthanarayanan and Anand Iyer and Yuanchao Shu and Victor Bahl and Z. Morley Mao and Mosharaf Chowdhury},
  booktitle = {USENIX NSDI},
  title     = {Vulcan: Automatic Query Planning for Live {ML} Analytics},
  year      = {2024},

  publist_confkey = {NSDI'24},
  publist_link = {paper || vulcan-nsdi24.pdf},
  publist_topic = {Systems + AI},
  publist_abstract = {
    Live ML analytics have gained increasing popularity with large-scale deployments due to recent evolution of ML technologies. To serve live ML queries, experts nowadays still need to perform manual query planning, which involves pipeline construction, query configuration, and pipeline placement across multiple edge tiers in a heterogeneous infrastructure. Finding the best query plan for a live ML query requires navigating a huge search space, calling for an efficient and systematic solution.

    In this paper, we propose Vulcan, a system that automatically generates query plans for live ML queries to optimize their accuracy, latency, and resource consumption. Based on the user query and performance requirements, Vulcan determines the best pipeline, placement, and query configuration for the query with low profiling cost; it also performs fast online adaptation after query deployment. Vulcan outperforms state-of-the-art ML analytics systems by 4.1$\times$-30.1$\times$ in terms of search cost while delivering up to 3.3$\times$ better query latency.
  }
}

@InProceedings{netlock:sigcomm20,
  author    = {Zhuolong Yu and Yiwen Zhang and Vladimir Braverman and Mosharaf Chowdhury and Xin Jin},
  booktitle = {ACM SIGCOMM},
  title     = {{NetLock}: Fast, Centralized Lock Management Using Programmable Switches},
  year      = {2020},
  pages     = {126--138},

  publist_confkey = {SIGCOMM'20},
  publist_link = {paper || netlock-sigcomm20.pdf},
  publist_link = {code || https://github.com/netx-repo/NetLock},
  publist_topic = {Datacenter Networking},
  publist_topic = {Disaggregation},
  publist_badge = {Artifacts Available},
  publist_badge = {Artifacts Functional},
  publist_badge = {Results Reproduced},
  publist_abstract = {
    Lock managers are widely used by distributed systems. Traditional centralized lock managers can easily support policies between multiple users using global knowledge, but they suffer from low performance. In contrast, emerging decentralized approaches are faster but cannot provide flexible policy support. Furthermore, performance in both cases is limited by the server capability.

    We present NetLock, a new centralized lock manager that co-designs servers and network switches to achieve high performance without sacrificing flexibility in policy support. The key idea of NetLock is to exploit the capability of emerging programmable switches to directly process lock requests in the switch data plane. Due to the limited switch memory, we design a memory management mechanism to seamlessly integrate the switch and server memory. To realize the locking functionality in the switch, we design a custom data plane module that efficiently pools multiple register arrays together to maximize memory utilization. We have implemented a NetLock prototype with a Barefoot Tofino switch and a cluster of commodity servers. Evaluation results show that NetLock improves the throughput by 14.0-18.4x, and reduces the average and 99% latency by 4.7-20.3x and 10.4-18.7x over DSLR, a state-of-the-art RDMA-based solution, while providing flexible policy support.
  }
}

@InProceedings{leap:atc20,
  author           = {Hasan Al Maruf and Mosharaf Chowdhury},
  title            = {Effectively Prefetching Remote Memory with {Leap}},
  booktitle        = {USENIX ATC},
  year             = {2020},
  pages            = {843--857},

  publist_confkey  = {ATC'20},
  publist_link     = {paper || leap-atc20.pdf},
  publist_link     = {slides || leap-atc20-slides.pdf},
  publist_link     = {code || https://github.com/SymbioticLab/Leap},
  publist_badge    = {Best Paper Award},
  publist_topic    = {Disaggregation},
  publist_abstract = {
    Memory disaggregation over RDMA can improve the performance of memory-constrained applications by replacing disk swapping with remote memory accesses. However, state-of-the-art memory disaggregation solutions still use data path components designed for slow disks. As a result, applications experience remote memory access latency significantly higher than that of the underlying low-latency network, which itself can be too high for many applications.

    In this paper, we propose Leap, a prefetching solution for remote memory accesses due to memory disaggregation. At its core, Leap employs an online, majority-based prefetching algorithm, which increases the page cache hit rate. We complement it with a lightweight and efficient data path in the kernel that isolates each application’s data path to the disaggregated memory and mitigates latency bottlenecks arising from legacy throughput-optimizing operations. Integration of Leap in the Linux kernel improves the median and tail remote page access latencies of memory-bound applications by up to 104.04× and 22.62×, respectively, over the default data path. This leads to up to 10.16× performance improvements for applications using disaggregated memory in comparison to the state-of-the-art solutions.
  }
}

@InProceedings{hydra:fast22,
  author    = {Youngmoon Lee and Hasan Al Maruf and Mosharaf Chowdhury and Asaf Cidon and Kang G. Shin},
  booktitle = {USENIX FAST},
  title     = {{Hydra}: Resilient and Highly Available Remote Memory},
  year      = {2022},

  publist_confkey = {FAST'22},
  publist_link = {paper || hydra-fast22.pdf},
  publist_link = {code || https://github.com/SymbioticLab/hydra},
  publist_topic = {Disaggregation},
  publist_abstract = {
    We present Hydra, a low-latency, low-overhead, and highly available resilience mechanism for remote memory. Hydra can access erasure-coded remote memory within a single-digit $\mu$s read/write latency, significantly improving the performance-efficiency tradeoff over the state-of-the-art -- it performs similar to in-memory replication with 1.6$\times$ lower memory overhead. We also propose CodingSets, a novel coding group placement algorithm for erasure-coded data, that provides load balancing while reducing the probability of data loss under correlated failures by an order of magnitude. With Hydra, even when only 50\% memory is local, unmodified memory-intensive applications achieve performance close to that of the fully in-memory case in the presence of remote failures and outperforms the state-of-the-art remote-memory solutions by up to 4.35$\times$.
  }
}

@InProceedings{tpp:asplos23,
  author    = {Hasan Al Maruf and Hao Wang and Abhishek Dhanotia and Johannes Weiner and Niket Agarwal and Pallab Bhattacharya and Chris Petersen and Mosharaf Chowdhury and Shobhit Kanaujia and Prakash Chauhan},
  booktitle = {ASPLOS},
  title     = {{TPP}: Transparent Page Placement for {CXL}-Enabled Tiered-Memory},
  year      = {2023},

  publist_confkey = {ASPLOS'23},
  publist_badge = {MICRO Top Picks Honorable Mention},
  publist_link = {paper || tpp-asplos23.pdf},
  publist_link = {slides || tpp-slides-asplos23.pdf},
  publist_link = {code || https://lwn.net/Articles/876993/},
  publist_topic = {Disaggregation},
  publist_abstract = {
    The increasing demand for memory in hyperscale applications has led to memory becoming a large portion of the overall datacenter spend. The emergence of coherent interfaces like CXL enables main memory expansion and offers an efficient solution to this problem. In such systems, the main memory can constitute different memory technologies with varied characteristics. In this paper, we characterize memory usage patterns of a wide range of datacenter applications across the server fleet of Meta. We, therefore, demonstrate the opportunities to offload colder pages to slower memory tiers for these applications. Without efficient memory management, however, such systems can significantly degrade performance.

    We propose a novel OS-level application-transparent page placement mechanism (TPP) for CXL-enabled memory. TPP employs a lightweight mechanism to identify and place hot/cold pages to appropriate memory tiers. It enables a proactive page demotion from local memory to CXL-Memory. This technique ensures a memory headroom for new page allocations that are often related to request processing and tend to be short-lived and hot. At the same time, TPP can promptly promote performance-critical hot pages trapped in the slow CXL-Memory to the fast local memory, while minimizing both sampling overhead and unnecessary migrations. TPP works transparently without any application-specific knowledge and can be deployed globally as a kernel release.

    We evaluate TPP with diverse memory-sensitive workloads in the production server fleet with early samples of new x86 CPUs with CXL 1.1 support. TPP makes a tiered memory system performant as an ideal baseline (<1% gap) that has all the memory in the local tier. It is 18% better than today’s Linux, and 5–17% better than existing solutions including NUMA Balancing and AutoTiering. Most of the TPP patches have been merged in the Linux v5.18 release while the remaining ones are just pending for more discussion.
  }
}

@InProceedings{memtrade:sigmetrics23,
  author           = {Hasan Al Maruf and Yuhong Zhong and Hongyi Wang and Mosharaf Chowdhury and Asaf Cidon and Carl Waldspurger},
  title            = {Memtrade: Marketplace for Disaggregated Memory Clouds},
  booktitle        = {SIGMETRICS},
  year             = {2023},

  publist_confkey  = {SIGMETRICS'23},
  publist_link     = {paper || memtrade-sigmetrics23.pdf},
  publist_link     = {code || https://github.com/SymbioticLab/Memtrade},
  publist_topic    = {Disaggregation},
  publist_abstract = {
    We present Memtrade, the first practical marketplace for disaggregated memory clouds. Clouds introduce a set of unique challenges for resource disaggregation across different tenants, including resource harvesting, isolation, and matching. Memtrade allows producer virtual machines (VMs) to lease both their unallocated memory and allocated-but-idle application memory to remote consumer VMs for a limited period of time. Memtrade does not require any modifications to host-level system software or support from the cloud provider. It harvests producer memory using an application-aware control loop to form a distributed transient remote memory pool with minimal performance impact; it employs a broker to match producers with consumers while satisfying performance constraints; and it exposes the matched memory to consumers through different abstractions. As a proof of concept, we propose two such memory access interfaces for Memtrade consumers -- a transient KV cache for specified applications and a swap interface that is application-transparent. Our evaluation using real-world cluster traces shows that Memtrade provides significant performance benefit for consumers (improving average read latency up to 2.8X) while preserving confidentiality and integrity, with little impact on producer applications (degrading performance by less than 2.1%).
  }
}

@article{hsct-survey,
  author           = {Gupta, Vibhuti and Braun, Thomas M and Chowdhury, Mosharaf and Tewari, Muneesh and Choi, Sung Won},
  title            = {A Systematic Review of Machine Learning Techniques in Hematopoietic Stem Cell Transplantation ({HSCT})},
  journal          = {Sensors},
  volume           = {20},
  number           = {21},
  pages            = {6100},
  year             = {2020},
  publisher        = {Multidisciplinary Digital Publishing Institute},

  publist_confkey  = {Sensors:20(21)},
  publist_link     = {paper || https://www.mdpi.com/1424-8220/20/21/6100},
  publist_abstract = {
    Machine learning techniques are widely used nowadays in the healthcare domain for the diagnosis, prognosis, and treatment of diseases. These techniques have applications in the field of hematopoietic cell transplantation (HCT), which is a potentially curative therapy for hematological malignancies. Herein, a systematic review of the application of machine learning (ML) techniques in the HCT setting was conducted. We examined the type of data streams included, specific ML techniques used, and type of clinical outcomes measured. A systematic review of English articles using PubMed, Scopus, Web of Science, and IEEE Xplore databases was performed. Search terms included "hematopoietic cell transplantation (HCT)," "autologous HCT," "allogeneic HCT," "machine learning," and "artificial intelligence." Only full-text studies reported between January 2015 and July 2020 were included. Data were extracted by two authors using predefined data fields. Following PRISMA guidelines, a total of 242 studies were identified, of which 27 studies met the inclusion criteria. These studies were sub-categorized into three broad topics and the type of ML techniques used included ensemble learning (63%), regression (44%), Bayesian learning (30%), and support vector machine (30%). The majority of studies examined models to predict HCT outcomes (e.g., survival, relapse, graft-versus-host disease). Clinical and genetic data were the most commonly used predictors in the modeling process. Overall, this review provided a systematic review of ML techniques applied in the context of HCT. The evidence is not sufficiently robust to determine the optimal ML technique to use in the HCT setting and/or what minimal data variables are required.
  }
}

@InProceedings{kayak:nsdi21,
  author    = {Jie You and Jingfeng Wu and Xin Jin and Mosharaf Chowdhury},
  booktitle = {USENIX NSDI},
  title     = {Ship Compute or Ship Data? Why Not Both?},
  year      = {2021},

  publist_confkey = {NSDI'21},
  publist_link = {paper || kayak-nsdi21.pdf},
  publist_link = {code || https://github.com/SymbioticLab/Kayak},
  publist_topic = {Disaggregation},
  publist_abstract = {
    How cloud applications should interact with their data remains an active area of research. Over the last decade, many have suggested relying on a key-value (KV) interface to interact with data stored in remote storage servers, while others have vouched for the benefits of using remote procedure call (RPC). Instead of choosing one over the other, in this paper, we observe that an ideal solution must adaptively combine both of them in order to maximize throughput while meeting application latency requirements. To this end, we propose a new system called Kayak that proactively adjusts the rate of requests and the fraction of requests to be executed using RPC or KV, all in a fully decentralized and self-regulated manner. We theoretically prove that Kayak can quickly converge to the optimal parameters. We implement a system prototype of Kayak. Our evaluations show that Kayak achieves sub-second convergence and improves overall throughput by 32.5%-63.4% for compute-intensive workloads and up to 12.2% for non-compute-intensive and transactional workloads over the state-of-the-art.
  }
}

@article{fedscale:arxiv21,
  title={{FedScale}: Benchmarking Model and System Performance of Federated Learning},
  author={Lai, Fan and Dai, Yinwei and Zhu, Xiangfeng and Chowdhury, Mosharaf},
  archiveprefix = {arXiv},
  eprint= {2105.11367},
  url= {https://arxiv.org/abs/2105.11367},
  year={2021},
  month={May},
  publist_confkey = {arXiv:2105.11367},
  publist_link = {paper || https://arxiv.org/abs/2105.11367},
  publist_link = {code || https://github.com/SymbioticLab/FedScale},
  publist_link = {website || https://fedscale.ai/},
  publist_topic = {Wide-Area Computing},
  publist_topic = {Systems + AI},
  publist_abstract = {
    We present FedScale, a diverse set of challenging and realistic benchmark datasets to facilitate scalable, comprehensive, and reproducible
    federated learning (FL) research. FedScale datasets are large-scale, encompassing a diverse range of important FL tasks, such as image
    classification, object detection, language modeling, speech recognition, and reinforcement learning. For each dataset, we provide a unified
    evaluation protocol using realistic data splits and evaluation metrics. To meet the pressing need for reproducing realistic FL at scale,
    we have also built an efficient evaluation platform to simplify and standardize the process of FL experimental setup and model evaluation.
    Our evaluation platform provides flexible APIs to implement new FL algorithms and include new execution backends with minimal developer
    efforts. Finally, we perform in-depth benchmark experiments on these datasets. Our experiments suggest that FedScale presents significant
    challenges of heterogeneity-aware co-optimizations of the system and statistical efficiency under realistic FL characteristics,
    indicating fruitful opportunities for future research. FedScale is open-source with permissive licenses and actively maintained, and
    we welcome feedback and contributions from the community.
  }
}

@article{fed-ensemble:arxiv21,
  author           = {Naichen Shi and Fan Lai and Raed Al Kontar and Mosharaf Chowdhury},
  title            = {Fed-ensemble: Improving Generalization through Model Ensembling in Federated Learning},
  year             = {2021},
  month            = {July},
  archiveprefix    = {arXiv},
  eprint           = {2107.10663},
  url              = {https://arxiv.org/abs/2107.10663},

  publist_confkey  = {arXiv:2107.10663},
  publist_link     = {paper || https://arxiv.org/abs/2107.10663},
  publist_topic    = {Wide-Area Computing},
  publist_abstract = {
    In this paper we propose Fed-ensemble: a simple approach that brings model ensembling to
    federated learning (FL). Instead of aggregating local models to update a single global model, Fed-ensemble
    uses random permutations to update a group of K models and then obtains predictions
    through model averaging. Fed-ensemble can be readily utilized within established FL methods and
    does not impose a computational overhead as it only requires one of the K models to be sent to a
    client in each communication round. Theoretically, we show that predictions on new data from all K
    models belong to the same predictive posterior distribution under a neural tangent kernel regime.
    This result in turn sheds light on the generalization advantages of model averaging. We also illustrate
    that Fed-ensemble has an elegant Bayesian interpretation. Empirical results show that our model
    has superior performance over several FL algorithms, on a wide range of data sets, and excels in
    heterogeneous settings often encountered in FL applications.
  }
}

@article{ioft-survey:arxiv21,
  author           = {Raed Kontar and Naichen Shi and Xubo Yue and Seokhyun Chung and Eunshin Byon and Mosharaf Chowdhury and Judy Jin and Wissam Kontar and Neda Masoud and Maher Noueihed and Chinedum E Okwudire and Garvesh Raskutti and Romesh Saigal and Karandeep Singh and Zhisheng Ye},
  title            = {{The Internet of Federated Things (IoFT)}: A Vision for the Future and In-Depth Survey of Data-Driven Approaches for Federated Learning},
  year             = {2021},
  month            = {November},
  archiveprefix    = {arXiv},
  eprint           = {2111.05326},
  url              = {https://arxiv.org/abs/2111.05326},

  publist_confkey  = {arXiv:2111.05326},
  publist_link     = {paper || https://arxiv.org/abs/2111.05326},
  publist_topic    = {Wide-Area Computing},
  publist_abstract = {
    The Internet of Things (IoT) is on the verge of a major paradigm shift. In the IoT system of the future, IoFT, the cloud will be substituted by the crowd where model training is brought to the edge, allowing IoT devices to collaboratively extract knowledge and build smart analytics/models while keeping their personal data stored locally. This paradigm shift was set into motion by the tremendous increase in computational power on IoT devices and the recent advances in decentralized and privacy-preserving model training, coined as federated learning (FL). This article provides a vision for IoFT and a systematic overview of current efforts towards realizing this vision. Specifically, we first introduce the defining characteristics of IoFT and discuss FL data-driven approaches, opportunities, and challenges that allow decentralized inference within three dimensions: (i) a global model that maximizes utility across all IoT devices, (ii) a personalized model that borrows strengths across all devices yet retains its own model, (iii) a meta-learning model that quickly adapts to new devices or learning tasks. We end by describing the vision and challenges of IoFT in reshaping different industries through the lens of domain experts. Those industries include manufacturing, transportation, energy, healthcare, quality & reliability, business, and computing.
  }
}

@article{ioft-survey:ieee-access,
  author           = {Raed Kontar and Naichen Shi and Xubo Yue and Seokhyun Chung and Eunshin Byon and Mosharaf Chowdhury and Judy Jin and Wissam Kontar and Neda Masoud and Maher Noueihed and Chinedum E Okwudire and Garvesh Raskutti and Romesh Saigal and Karandeep Singh and Zhisheng Ye},
  title            = {The {I}nternet of {F}ederated {T}hings ({IoFT})},
  journal          = {IEEE Access},
  volume           = {9},
  pages            = {156071--156113},
  year             = {2021},
  publisher        = {IEEE},

  publist_confkey  = {IEEE Access:9},
  publist_link     = {paper || https://doi.org/10.1109/ACCESS.2021.3127448},
  publist_badge    = {Featured Article},
  publist_topic    = {Wide-Area Computing},
  publist_abstract = {
    The Internet of Things (IoT) is on the verge of a major paradigm shift. In the IoT system of the future, IoFT, the cloud will be substituted by the crowd where model training is brought to the edge, allowing IoT devices to collaboratively extract knowledge and build smart analytics/models while keeping their personal data stored locally. This paradigm shift was set into motion by the tremendous increase in computational power on IoT devices and the recent advances in decentralized and privacy-preserving model training, coined as federated learning (FL). This article provides a vision for IoFT and a systematic overview of current efforts towards realizing this vision. Specifically, we first introduce the defining characteristics of IoFT and discuss FL data-driven approaches, opportunities, and challenges that allow decentralized inference within three dimensions: (i) a global model that maximizes utility across all IoT devices, (ii) a personalized model that borrows strengths across all devices yet retains its own model, (iii) a meta-learning model that quickly adapts to new devices or learning tasks. We end by describing the vision and challenges of IoFT in reshaping different industries through the lens of domain experts. Those industries include manufacturing, transportation, energy, healthcare, quality & reliability, business, and computing.
  }
}

% JMIR Mental Health 9(2), 2022: observational study of COVID-19 risk factors
% in college students (Roadmap app + Fitbit). `month` uses the standard
% unquoted BibTeX month macro so that styles can format/localize it.
@Article{covid-f20:jmirmh,
    author={Gilley, Kristen N and Baroudi, Loubna and Yu, Miao and Gainsburg, Izzy and Reddy, Niyanth and Bradley, Christina and Cislo, Christine and Rozwadowski, Michelle Lois and Clingan, Caroline Ashley and DeMoss, Matthew Stephen and Churay, Tracey and Birditt, Kira and Colabianchi, Natalie and Chowdhury, Mosharaf and Forger, Daniel and Gagnier, Joel and Zernicke, Ronald F and Cunningham, Julia Lee and Cain, Stephen M and Tewari, Muneesh and Choi, Sung Won},
    title={Risk Factors for {COVID-19} in College Students Identified by Physical, Mental, and Social Health Reported During the {Fall 2020} Semester: Observational Study Using the {Roadmap} App and {Fitbit} Wearable Sensors},
    journal={JMIR Mental Health},
    year={2022},
    month=feb,
    day={10},
    volume={9},
    number={2},
    pages={e34645},
    publisher={JMIR Publications},
    publist_confkey = {JMIR-MH:9(2)},
    publist_link = {paper || https://doi.org/10.2196/34645},
    abstract={Background: The COVID-19 pandemic triggered a seismic shift in education to web-based learning. With nearly 20 million students enrolled in colleges across the United States, the long-simmering mental health crisis in college students was likely further exacerbated by the pandemic. Objective: This study leveraged mobile health (mHealth) technology and sought to (1) characterize self-reported outcomes of physical, mental, and social health by COVID-19 status; (2) assess physical activity through consumer-grade wearable sensors (Fitbit); and (3) identify risk factors associated with COVID-19 positivity in a population of college students prior to release of the vaccine. Methods: After completing a baseline assessment (ie, at Time 0 [T0]) of demographics, mental, and social health constructs through the Roadmap 2.0 app, participants were instructed to use the app freely, wear the Fitbit, and complete subsequent assessments at T1, T2, and T3, followed by a COVID-19 assessment of history and timing of COVID-19 testing and diagnosis (T4: {\textasciitilde}14 days after T3). Continuous measures were described using mean (SD) values, while categorical measures were summarized as n ({\%}) values. Formal comparisons were made on the basis of COVID-19 status. The multivariate model was determined by entering all statistically significant variables (P<.05) in univariable associations at once and then removing one variable at a time through backward selection until the optimal model was obtained. Results: During the fall 2020 semester, 1997 participants consented, enrolled, and met criteria for data analyses. There was a high prevalence of anxiety, as assessed by the State Trait Anxiety Index, with moderate and severe levels in 465 (24{\%}) and 970 (49{\%}) students, respectively. Approximately one-third of students reported having a mental health disorder (n=656, 33{\%}). 
The average daily steps recorded in this student population was approximately 6500 (mean 6474, SD 3371). Neither reported mental health nor step count were significant based on COVID-19 status (P=.52). Our analyses revealed significant associations of COVID-19 positivity with the use of marijuana and alcohol (P=.02 and P=.046, respectively) and with lower belief in public health measures (P=.003). In addition, graduate students were less likely and those with ≥20 roommates were more likely to report a COVID-19 diagnosis (P=.009). Conclusions: Mental health problems were common in this student population. Several factors, including substance use, were associated with the risk of COVID-19. These data highlight important areas for further attention, such as prioritizing innovative strategies that address health and well-being, considering the potential long-term effects of COVID-19 on college students. Trial Registration: ClinicalTrials.gov NCT04766788; https://clinicaltrials.gov/ct2/show/NCT04766788 International Registered Report Identifier (IRRID): RR2-10.2196/29561 },
    issn={2368-7959},
    doi={10.2196/34645}
}


% Fluid (MLSys 2021): hyperparameter tuning execution engine.
% The repeated publist_link fields are kept in their original order
% (paper, slides, poster, code, oral video, poster video).
@InProceedings{fluid:mlsys21,
  title            = {Fluid: Resource-Aware Hyperparameter Tuning Engine},
  author           = {Peifeng Yu and Jiachen Liu and Mosharaf Chowdhury},
  booktitle        = {MLSys},
  year             = {2021},
  publist_confkey  = {MLSys'21},
  publist_link     = {paper || fluid-mlsys21.pdf},
  publist_link     = {slides || fluid-mlsys21.pptx},
  publist_link     = {poster || fluid-mlsys21-poster.pdf},
  publist_link     = {code || https://github.com/SymbioticLab/fluid},
  publist_link     = {video (oral) || https://www.youtube.com/watch?v=LZQE90whvfY},
  publist_link     = {video (poster) || https://www.youtube.com/watch?v=yaBxEUxRXZI},
  publist_topic    = {Systems + AI},
  publist_abstract = {
    Current hyperparameter tuning solutions lack complementary execution engines to efficiently leverage distributed
    computation, thus ignoring the possibility of intra- and inter-GPU sharing, which exhibits poor resource usage.
    In this paper, we present Fluid, a generalized hyperparameter tuning execution engine, that coordinates between
    hyperparameter tuning jobs and cluster resources. Fluid schedules evaluation trials in such jobs using a waterfilling
    approach to make the best use of resources both at intra- and inter-GPU granularities to speed up the tuning
    process. By abstracting a hyperparameter tuning job as a sequence of TrialGroup, Fluid can boost the performance
    of diverse hyperparameter tuning solutions. Our experiments show that Fluid can speed up synchronous BOHB by
    100\%, and BOHB and ASHA by 30\% while having similar final accuracy.
  }
}

% Oort (USENIX OSDI 2021): guided participant selection for federated learning.
% The abstract refers to the system as "Oort", consistent with the entry's
% title, citation key, and code link.
@InProceedings{oort:osdi21,
  author    = {Fan Lai and Xiangfeng Zhu and Harsha V. Madhyastha and Mosharaf Chowdhury},
  booktitle = {USENIX OSDI},
  title     = {Oort: Efficient Federated Learning via Guided Participant Selection},
  year      = {2021},

  publist_confkey = {OSDI'21},
  publist_link = {paper || oort-osdi21.pdf},
  publist_link = {code || https://github.com/SymbioticLab/Oort},
  publist_badge = {Artifacts Available},
  publist_badge = {Artifacts Functional},
  publist_badge = {Results Reproduced},
  publist_badge = {Distinguished Artifact Award},
  publist_topic = {Wide-Area Computing},
  publist_topic = {Systems + AI},
  publist_abstract = {
    Federated Learning (FL) is an emerging direction in distributed machine learning (ML) that enables in-situ model training and testing on edge data. Despite having the same end goals as traditional ML, FL executions differ significantly in scale, spanning thousands to millions of participating devices. As a result, data characteristics and device capabilities vary widely across clients. Yet, existing efforts randomly select FL participants, which leads to poor model and system efficiency.
    In this paper, we propose Oort to improve the performance of federated training and testing with guided participant selection. With an aim to improve time-to-accuracy performance in model training, Oort prioritizes the use of those clients who have both data that offers the greatest utility in improving model accuracy and the capability to run training quickly. To enable FL developers to interpret their results in model testing, Oort enforces their requirements on the distribution of participant data while improving the duration of federated testing by cherry-picking clients. Our evaluation shows that, compared to existing participant selection mechanisms, Oort improves time-to-accuracy performance by 1.2×-14.1× and final model accuracy by 1.3%-9.8%, while efficiently enforcing developer-specified model testing criteria at the scale of millions of clients.
  }
}

@InProceedings{fedscale:resilientfl21,
  author    = {Fan Lai and Yinwei Dai and Xiangfeng Zhu and Harsha V. Madhyastha and Mosharaf Chowdhury},
  booktitle = {ACM SOSP ResilientFL},