:github_url: https://github.com/pytorch/pytorch


.. _program_listing_file_c10_cuda_CUDAStream.h:

Program Listing for File CUDAStream.h
=====================================

|exhale_lsh| :ref:`Return to documentation for file <file_c10_cuda_CUDAStream.h>` (``c10/cuda/CUDAStream.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #pragma once
   
   #include <cuda_runtime_api.h>
   
   #include <c10/core/DeviceGuard.h>
   #include <c10/core/Stream.h>
   #include <c10/cuda/CUDAFunctions.h>
   #include <c10/util/Exception.h>
   
   /*
    * Stream pool note.
    *
    * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
    * are backed by cuStreams, but they use several pools to minimize the costs
    * associated with creating, retaining, and destroying cuStreams.
    *
    * There are three pools per device, and a device's pools are lazily created.
    *
    * The first pool contains only the default stream. When the default stream
    * is requested it's returned.
    *
    * The second pool holds the "low priority" (or "default priority") streams.
    * In HIP builds there is no distinction between streams in this pool and
    * streams in the third pool (below). There are 32 of these streams per
    * device, and requests are served round-robin: the first request returns the
    * stream at index 0, the second the stream at index 1, and so on up to
    * index 31, after which the next request wraps around to index 0.
    *
    * This means that if 33 low priority streams are requested, the first and
    * last streams requested are actually the same stream (under the covers)
    * and kernels enqueued on them cannot run concurrently.
    *
    * The third pool is the "high priority" streams. The third pool acts like
    * the second pool except the streams are created with a higher priority.
    *
    * These pools suggest that stream users should prefer many short-lived streams,
    * as the cost of acquiring and releasing streams is effectively zero. If
    * many longer-lived streams are required in performance critical scenarios
    * then the functionality here may need to be extended to allow, for example,
    * "reserving" a subset of the pool so that other streams do not accidentally
    * overlap the performance critical streams.
    *
    * Note: although the notion of "current stream for device" is thread local
    * (every OS thread has a separate current stream, as one might expect),
    * the stream pool is global across all threads; stream 0 is always stream 0
    * no matter which thread you use it on.  Multiple threads can synchronize
    * on the same stream.  Although the CUDA documentation is not very clear
    * on the matter, streams are thread safe; e.g., it is safe to enqueue
    * a kernel on the same stream from two different threads.
    */
   
   namespace c10::cuda {
   
   // Number of distinct stream priority levels that are compiled in.
   // CUDAStream::priority_range() uses it to clamp the greatest (most negative)
   // priority it reports to -(max_compile_time_stream_priorities - 1).
   static constexpr int max_compile_time_stream_priorities{4};
   
   // CUDA-flavoured value wrapper over c10::Stream.  It adds the CUDA-specific
   // pieces (most notably conversion to cudaStream_t) and, when built through
   // the checked constructor, guarantees that the wrapped c10::Stream really
   // is a CUDA stream.
   class C10_CUDA_API CUDAStream {
    public:
     // Tag type selecting the constructor that skips the device-type check.
     enum Unchecked { UNCHECKED };

     // Wraps `stream`; TORCH_CHECK fails unless its device type is CUDA.
     explicit CUDAStream(Stream stream) : stream_(stream) {
       TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
     }

     // Wraps `stream` without validating its device type.
     explicit CUDAStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {}

     // Two CUDAStreams are equal exactly when their wrapped c10::Streams are.
     bool operator==(const CUDAStream& other) const noexcept {
       Stream mine = unwrap();
       Stream theirs = other.unwrap();
       return mine == theirs;
     }

     bool operator!=(const CUDAStream& other) const noexcept {
       Stream mine = unwrap();
       Stream theirs = other.unwrap();
       return mine != theirs;
     }

     // Implicit conversion to the raw CUDA handle (same value as stream()).
     operator cudaStream_t() const {
       return stream();
     }

     // Implicit conversion back to the generic c10::Stream (same as unwrap()).
     operator Stream() const {
       return stream_;
     }

     // Always DeviceType::CUDA; the wrapped stream is not consulted.
     DeviceType device_type() const {
       return DeviceType::CUDA;
     }

     // Index of the device the wrapped stream is associated with.
     DeviceIndex device_index() const {
       return stream_.device_index();
     }

     // CUDA Device built from device_type() and device_index().
     Device device() const {
       return Device(device_type(), device_index());
     }

     // Stream id of the wrapped c10::Stream.
     StreamId id() const {
       return stream_.id();
     }

     // Defined out of line.
     // NOTE(review): presumably reports whether all work enqueued on the
     // stream has finished -- confirm against the definition.
     bool query() const;

     // Defined out of line.
     // NOTE(review): presumably blocks until all work enqueued on the stream
     // has finished -- confirm against the definition.
     void synchronize() const;

     // Priority of this stream as reported by cudaStreamGetPriority.  A
     // DeviceGuard on the stream's device is held while the query runs.
     int priority() const {
       DeviceGuard guard{stream_.device()};
       int priority = 0;
       C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
       return priority;
     }

     // Raw cudaStream_t handle for this stream; defined out of line.
     cudaStream_t stream() const;

     // Copy of the wrapped generic c10::Stream.
     Stream unwrap() const {
       return stream_;
     }

     // Packs the wrapped stream into a c10::StreamData3 (see Stream::pack3).
     struct c10::StreamData3 pack3() const {
       return stream_.pack3();
     }

     // Rebuilds a CUDAStream from the three fields produced by pack3().  The
     // result goes through the checked constructor, so a non-CUDA
     // `device_type` is rejected.
     static CUDAStream unpack3(
         StreamId stream_id,
         DeviceIndex device_index,
         DeviceType device_type) {
       Stream generic = Stream::unpack3(stream_id, device_index, device_type);
       return CUDAStream(generic);
     }

     // Returns (least_priority, greatest_priority).  Numerically lower values
     // are higher priorities, so greatest_priority is the negative end.
     static std::tuple<int, int> priority_range() {
       // Note: this returns the range of priority **supported by PyTorch**, not
       // the range of priority **supported by CUDA**. The former is a subset of
       // the latter.
       int least_priority = 0;
       int greatest_priority = 0;
       C10_CUDA_CHECK(
           cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
   #ifdef USE_ROCM
       // See Note [HIP stream priorities]
       TORCH_INTERNAL_ASSERT(
           least_priority == 1, "Unexpected HIP stream priority range");
       least_priority = 0;
   #else
       TORCH_INTERNAL_ASSERT(
           least_priority == 0, "Unexpected CUDA stream priority range");
   #endif
       TORCH_INTERNAL_ASSERT(
           greatest_priority <= -1, "Unexpected CUDA stream priority range");

       // Never report a priority beyond the number of levels compiled in.
       constexpr int most_negative_supported =
           -c10::cuda::max_compile_time_stream_priorities + 1;
       if (greatest_priority < most_negative_supported) {
         greatest_priority = most_negative_supported;
       }
       return std::tuple<int, int>(least_priority, greatest_priority);
     }

     // Deleted for now; use CUDAEvent::block instead
     // void synchronize_with(const CUDAEvent& event) const;

    private:
     // The generic stream this object wraps.
     Stream stream_;
   };
   
   // NOTE(review): the CUDAStream class is exported with C10_CUDA_API while the
   // free functions below use C10_API -- confirm this is intentional.

   // Returns a stream from the per-device stream pools described in the stream
   // pool note at the top of this file.
   // NOTE(review): `isHighPriority` presumably selects the high-priority pool
   // instead of the low-priority one, and `device == -1` presumably means "the
   // current device" -- confirm against the definition.
   C10_API CUDAStream
   getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
   // no default priority to disambiguate overloads
   // (if `priority` also had a default, a call with no arguments would be
   // ambiguous between this overload and the bool overload above)
   C10_API CUDAStream
   getStreamFromPool(const int priority, DeviceIndex device = -1);
   
   // Builds a CUDAStream from a cudaStream_t that was created outside of the
   // pools, associated with `device_index`.
   // NOTE(review): ownership/lifetime of `ext_stream` presumably stays with the
   // caller -- confirm against the definition.
   C10_API CUDAStream
   getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
   
   // Returns the default stream of a device (the sole member of the first pool
   // in the stream pool note).
   // NOTE(review): `device_index == -1` presumably means "the current device".
   C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
   
   // Returns the current stream for a device.  Per the stream pool note, the
   // "current stream for device" is thread local.
   // NOTE(review): `device_index == -1` presumably means "the current device".
   C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
   
   // Sets the current stream; as noted above, this notion is thread local.
   // NOTE(review): the affected device is presumably `stream`'s own device.
   C10_API void setCurrentCUDAStream(CUDAStream stream);
   
   // Stream-insertion operator for CUDAStream; the output format is determined
   // by the out-of-line definition.
   C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
   
   } // namespace c10::cuda
   
   // hipify v2 backward compat in external projects: the c10::cuda stream
   // entry points are re-exposed here under their c10::hip names.
   #ifdef USE_ROCM
   namespace c10::hip {
   using c10::cuda::getStreamFromExternal;
   using c10::cuda::getStreamFromPool;

   // Functions that have default arguments are forwarded through inline
   // wrappers: a reference alias would not carry the defaults along.
   inline c10::cuda::CUDAStream getDefaultHIPStream(
       DeviceIndex device_index = -1) {
     return c10::cuda::getDefaultCUDAStream(device_index);
   }
   inline c10::cuda::CUDAStream getCurrentHIPStream(
       DeviceIndex device_index = -1) {
     return c10::cuda::getCurrentCUDAStream(device_index);
   }
   // No default arguments here, so a plain reference alias is enough.
   inline auto& setCurrentHIPStream = c10::cuda::setCurrentCUDAStream;

   // "MasqueradingAsCUDA" spellings of the same entry points.
   inline c10::cuda::CUDAStream getStreamFromPoolMasqueradingAsCUDA(
       const bool isHighPriority = false,
       DeviceIndex device = -1) {
     // Resolves to the bool overload pulled in by the using-declaration above.
     return getStreamFromPool(isHighPriority, device);
   }
   inline c10::cuda::CUDAStream getStreamFromPoolMasqueradingAsCUDA(
       const int priority,
       DeviceIndex device = -1) {
     // Resolves to the int overload pulled in by the using-declaration above.
     return getStreamFromPool(priority, device);
   }
   inline auto& getStreamFromExternalMasqueradingAsCUDA =
       c10::cuda::getStreamFromExternal;
   inline c10::cuda::CUDAStream getDefaultHIPStreamMasqueradingAsCUDA(
       DeviceIndex device_index = -1) {
     return getDefaultHIPStream(device_index);
   }
   inline c10::cuda::CUDAStream getCurrentHIPStreamMasqueradingAsCUDA(
       DeviceIndex device_index = -1) {
     return getCurrentHIPStream(device_index);
   }
   inline auto& setCurrentHIPStreamMasqueradingAsCUDA =
       c10::cuda::setCurrentCUDAStream;
   } // namespace c10::hip
   #endif
   
   namespace std {
   // Hash support for CUDAStream: the hash of a CUDAStream is the hash of the
   // c10::Stream it wraps.
   template <>
   struct hash<c10::cuda::CUDAStream> {
     size_t operator()(c10::cuda::CUDAStream s) const noexcept {
       const c10::Stream wrapped = s.unwrap();
       return std::hash<c10::Stream>{}(wrapped);
     }
   };
   } // namespace std