vkr.bib

% !TeX spellcheck = ru_RU

@inproceedings{fsharpasyncawait,
author = {Syme, Don and Petricek, Tomas and Lomov, Dmitry},
year = {2011},
month = {01},
pages = {175-189},
title = {The F\# Asynchronous Programming Model},
volume = {6539},
isbn = {978-3-642-18377-5},
doi = {10.1007/978-3-642-18378-2_15}
}

@inproceedings{LCRQNoCAS2,
author = {Romanov, Raed and Koval, Nikita},
title = {The State-of-the-Art LCRQ Concurrent Queue Algorithm Does NOT Require CAS2},
year = {2023},
isbn = {9798400700156},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3572848.3577485},
doi = {10.1145/3572848.3577485},
abstract = {Concurrent queues are, arguably, one of the most important data structures in high-load applications, which require them to be extremely fast and scalable. Achieving these properties is non-trivial. The early solutions, such as the classic queue by Michael and Scott, store elements in a concurrent linked list. Reputedly, this design is non-scalable and memory-inefficient. Modern solutions utilize the Fetch-and-Add instruction to improve the algorithm's scalability and store elements in arrays to reduce the memory pressure. One of the most famous and fast such algorithms is LCRQ. The main disadvantage of its design is that it relies on the atomic CAS2 instruction, which is unavailable in most modern programming languages, such as Java, Kotlin, or Go, let alone some architectures.This paper presents the LPRQ algorithm, a portable modification of the original LCRQ design that eliminates all CAS2 usages. In contrast, it performs the synchronization utilizing only the standard Compare-and-Swap and Fetch-and-Add atomic instructions. Our experiments show that LPRQ provides the same performance as the classic LCRQ algorithm, outrunning the fastest of the existing solutions that do not use CAS2 by up to 1.6\texttimes{}.},
booktitle = {Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming},
pages = {14–26},
numpages = {13},
keywords = {concurrent queue, lock-free, ring buffer},
location = {Montreal, QC, Canada},
series = {PPoPP '23}
}

@inproceedings{FastWorkDistribution,
author = {Kappes, Giorgos and Anastasiadis, Stergios V.},
title = {A lock-free relaxed concurrent queue for fast work distribution},
year = {2021},
isbn = {9781450382946},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3437801.3441583},
doi = {10.1145/3437801.3441583},
abstract = {The operation of modern systems requires the low latency and high throughput of producer-consumer communication over shared memory. In order to achieve fast communication at high concurrency, we define a relaxed ordering model that splits the queue operations into two stages, the sequential assignment to queue slots and their subsequent concurrent execution. Based on this model, we design and implement the linearizable and lock-free algorithm called Relaxed Concurrent Queue Single (RCQS). We experimentally show that RCQS achieves factors to orders of magnitude advantage over the state-of-the-art queue algorithms in operation latency and item transfer speed.},
booktitle = {Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
pages = {454–456},
numpages = {3},
keywords = {array-based queue, linearizable, lock-free},
location = {Virtual Event, Republic of Korea},
series = {PPoPP '21}
}

@article{FamilyRelaxedConcurrentQueues,
author = {Kappes, Giorgos and Anastasiadis, Stergios V.},
title = {A Family of Relaxed Concurrent Queues for Low-Latency Operations and Item Transfers},
year = {2022},
issue_date = {December 2022},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {9},
number = {4},
issn = {2329-4949},
url = {https://doi.org/10.1145/3565514},
doi = {10.1145/3565514},
abstract = {The producer-consumer communication over shared memory is a critical function of current scalable systems. Queues that provide low latency and high throughput on highly utilized systems can improve the overall performance perceived by the end users. In order to address this demand, we set as priority to achieve both high operation performance and item transfer speed. The Relaxed Concurrent Queues (RCQs) are a family of queues that we have designed and implemented for that purpose. Our key idea is a relaxed ordering model that splits the enqueue and dequeue operations into a stage of sequential assignment to a queue slot and a stage of concurrent execution across the slots. At each slot, we apply no order restrictions among the operations of the same type. We define several variants of the RCQ algorithms with respect to offered concurrency, required hardware instructions, supported operations, occupied memory space, and precondition handling. For specific RCQ algorithms, we provide pseudo-code definitions and reason about their correctness and progress properties. Additionally, we theoretically estimate and experimentally validate the worst-case distance between an RCQ algorithm and a strict first-in-first-out (FIFO) queue. We developed prototype implementations of the RCQ algorithms and experimentally compare them with several representative strict FIFO and relaxed data structures over a range of workload and system settings. The RCQS algorithm is a provably linearizable lock-free member of the RCQ family. We experimentally show that RCQS achieves factors to orders of magnitude advantage over the state-of-the-art strict or relaxed queue algorithms across several latency and throughput statistics of the queue operations and item transfers.},
journal = {ACM Trans. Parallel Comput.},
month = dec,
articleno = {16},
numpages = {37},
keywords = {relaxed ordering, array-based queues, linearizable, blocking, lock-free, data structures, Producer-consumer}
}

@inproceedings{ScalableConcurrentQueuesManyCoreArchitectures,
author = {Scogland, Thomas R.W. and Feng, Wu-chun},
title = {Design and Evaluation of Scalable Concurrent Queues for Many-Core Architectures},
year = {2015},
isbn = {9781450332484},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2668930.2688048},
doi = {10.1145/2668930.2688048},
abstract = {As core counts increase and as heterogeneity becomes more common in parallel computing, we face the prospect of programming hundreds or even thousands of concurrent threads in a single shared-memory system. At these scales, even highly-efficient concurrent algorithms and data structures can become bottlenecks, unless they are designed from the ground up with throughput as their primary goal.In this paper, we present three contributions: (1) a characterization of queue designs in terms of modern multi- and many-core architectures, (2) the design of a high-throughput, linearizable, blocking, concurrent FIFO queue for many-core architectures that avoids the bottlenecks and pitfalls common in modern queue designs, and (3) a thorough evaluation of concurrent queue throughput across CPU, GPU, and co-processor devices. Our evaluation shows that focusing on throughput, rather than progress guarantees, allows our queue to scale to as much as three orders of magnitude (1000x) faster than lock-free and combining queues on GPU platforms and two times (2x) faster on CPU devices. These results deliver critical insights into the design of data structures for highly concurrent systems: (1) progress guarantees do not guarantee scalability, and (2) allowing an algorithm to block can increase throughput.},
booktitle = {Proceedings of the 6th ACM/SPEC International Conference on Performance Engineering},
pages = {63–74},
numpages = {12},
keywords = {accelerator, gpu, heterogeneous, many-core, queue},
location = {Austin, Texas, USA},
series = {ICPE '15}
}