cuda_OpticalFlowDual_TVL1 is not thread-safe in python

System information (version)
  • OpenCV => 4.3.0
  • Operating System / Platform => Ubuntu
  • Compiler => gcc
  • CUDA => 10.0
  • GPU => NVIDIA RTX 2080 Ti
Detailed description

I ran the same code twice and found that cuda_OpticalFlowDual_TVL1 produced different results when calculating optical flow with Python multithreading.

The differences between the two optical flow results for the same video appear to be frame-based: some frames in the same video give consistent results across runs and some do not.

This problem does not appear when using a single thread.

Steps to reproduce
import cv2
from threading import Thread
import numpy as np


def job(video_path):
    # Each thread gets its own TVL1 instance and its own VideoCapture.
    optical_flow = cv2.cuda_OpticalFlowDual_TVL1.create()
    video_capture = cv2.VideoCapture(video_path)
    _, prev_frame = video_capture.read()
    prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    output = []
    for i in range(10):
        _, current_frame = video_capture.read()
        current_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        # Upload both frames to the GPU and compute the dense optical flow.
        cuMat1 = cv2.cuda_GpuMat()
        cuMat2 = cv2.cuda_GpuMat()
        cuMat1.upload(prev_frame)
        cuMat2.upload(current_frame)
        cu_flow = optical_flow.calc(cuMat1, cuMat2, None)
        optical_flow_data = cu_flow.download()
        output.append(optical_flow_data)
        prev_frame = current_frame
    # Save the per-frame flow fields so two runs can be compared.
    np.save('{}.npy'.format(video_path[:-4]), output)


if __name__ == '__main__':
    video_path_list = ['video1.avi',
                       'video2.avi',
                       'video3.avi']
    worker_list = []
    for i in range(3):
        t = Thread(target=job, kwargs={'video_path': video_path_list[i]})
        t.start()
        worker_list.append(t)
    for worker in worker_list:
        worker.join()
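
To make the non-determinism visible, one way is to run the script above twice, rename the saved .npy files between runs, and compare the arrays. A minimal sketch of that comparison (the *_run1.npy / *_run2.npy file names are only illustrative, not produced by the reproducer itself):

import numpy as np

# Hypothetical file names: rename the output of each run before comparing.
run1 = np.load('video1_run1.npy')
run2 = np.load('video1_run2.npy')
print(np.array_equal(run1, run2))   # False whenever the race is hit
print(np.abs(run1 - run2).max())    # magnitude of the largest difference
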
Issue submission checklist
  • I report the issue, it’s not a question
  • I checked the problem with documentation, FAQ, open issues,
    answers.opencv.org, Stack Overflow, etc and have not found solution
  • I updated to latest OpenCV version and the issue is still there
  • There is reproducer code and related data files: videos, images, onnx, etc

1 possible answer(s) on “cuda_OpticalFlowDual_TVL1 is not thread-safe in python”

  1. @daniel-code
    I have slightly modified your test code so that it compares the results of synchronous runs with those of asynchronous (multithreaded) runs.
    You have to pass a cv2.cuda_Stream() when you call optical_flow.calc().

    Test Code

    import cv2
    from threading import Thread
    import numpy as np
    
    def job(video_path, output):
        optical_flow = cv2.cuda_OpticalFlowDual_TVL1.create()
        video_capture = cv2.VideoCapture(video_path)
        _, prev_frame = video_capture.read()
        prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
        for i in range(10):
            _, current_frame = video_capture.read()
            current_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
    
            cuMat1 = cv2.cuda_GpuMat()
            cuMat2 = cv2.cuda_GpuMat()
            cuMat1.upload(prev_frame)
            cuMat2.upload(current_frame)
            # A dedicated (non-default) stream keeps the TVL1 kernels off the default
            # CUDA stream, which is shared between all threads.
            cu_flow = optical_flow.calc(cuMat1, cuMat2, None, cv2.cuda_Stream())
            optical_flow_data = cu_flow.download()
    
            output.append(optical_flow_data)
            prev_frame = current_frame
    
    if __name__ == '__main__':
        video_path_list = ['E:/repos/opencv_extra/testdata/gpu/video/768x576.avi',
                           'E:/repos/opencv_extra/testdata/gpu/video/1920x1080.avi',
                           'E:/repos/opencv_extra/testdata/highgui/video/big_buck_bunny.mp4']
    
        # synchronous launch
        out0 = []
        out1 = []
        out2 = []
        
        job(video_path_list[0], out0)
        job(video_path_list[1], out1)
        job(video_path_list[2], out2)
        
        print('synchronous run complete')
        
        # asynchronous launch
        tout0 = []
        tout1 = []
        tout2 = []
    
        t1 = Thread(target=job, kwargs={'video_path': video_path_list[0], 'output': tout0})
        t1.start()
        t2 = Thread(target=job, kwargs={'video_path': video_path_list[1], 'output': tout1})
        t2.start()
        t3 = Thread(target=job, kwargs={'video_path': video_path_list[2], 'output': tout2})
        t3.start()
    
        t1.join()
        t2.join()
        t3.join()
        
        print('asynchronous run complete')
        
        # compare synchronous and asynchronous result
        print(np.array_equal(out0, tout0))
        print(np.array_equal(out1, tout1))
        print(np.array_equal(out2, tout2))
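
    The Python version above creates a new cv2.cuda_Stream() for every calc() call and synchronizes only through the blocking download() call, while the C++ version below reuses one stream per thread and synchronizes explicitly. As a rough sketch (not part of the original test), the same pattern in Python could look like this, assuming the rest of the script is unchanged:

    def job_reusing_stream(video_path, output):
        # Same as job() above, but with one non-default stream per thread, reused.
        optical_flow = cv2.cuda_OpticalFlowDual_TVL1.create()
        stream = cv2.cuda_Stream()
        video_capture = cv2.VideoCapture(video_path)
        _, prev_frame = video_capture.read()
        prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
        for i in range(10):
            _, current_frame = video_capture.read()
            current_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
            cuMat1 = cv2.cuda_GpuMat()
            cuMat2 = cv2.cuda_GpuMat()
            cuMat1.upload(prev_frame)
            cuMat2.upload(current_frame)
            cu_flow = optical_flow.calc(cuMat1, cuMat2, None, stream)
            stream.waitForCompletion()      # explicit sync, mirroring the C++ code below
            output.append(cu_flow.download())
            prev_frame = current_frame
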
    Test Code in C++

    void helper(const string& path, vector<Mat>* out_vec)
    {
        cv::VideoCapture capture(path);
    
        cv::Mat _prev;
        capture.read(_prev);
    
        cv::cuda::HostMem prev(_prev.size(), CV_8UC1);
        cv::cuda::HostMem cur(_prev.size(), CV_8UC1);
    
        cv::cvtColor(_prev, prev.createMatHeader(), cv::COLOR_BGR2GRAY);
    
        auto alg = cv::cuda::OpticalFlowDual_TVL1::create();
        cv::cuda::Stream stream;
    
        cv::cuda::GpuMat d_prev;
        d_prev.upload(prev, stream);
    
        for (int i = 0; i < 90; ++i)
        {
            cv::Mat _cur;
            capture.read(_cur);
            cv::cvtColor(_cur, cur.createMatHeader(), cv::COLOR_BGR2GRAY);
    
            cv::cuda::GpuMat d_cur;
            d_cur.upload(cur, stream);
    
            cv::cuda::GpuMat d_out;
            alg->calc(d_prev, d_cur, d_out, stream);
    
            cv::cuda::HostMem out;
    
            d_out.download(out, stream);
    
            stream.waitForCompletion();
    
            out_vec->push_back(out.createMatHeader().clone());
    
            d_prev = d_cur;
        }
    }
    
    TEST(OpticalFlowDual_TVL1_Issue, Issue18155)
    {
        vector<string> video_path_list;
        video_path_list.emplace_back("E:/repos/opencv_extra/testdata/gpu/video/768x576.avi");
        video_path_list.emplace_back("E:/repos/opencv_extra/testdata/gpu/video/1920x1080.avi");
        video_path_list.emplace_back("E:/repos/opencv_extra/testdata/highgui/video/big_buck_bunny.mp4");
    
        // synchronous run
    
        vector<Mat> t1_sync;
        vector<Mat> t2_sync;
        vector<Mat> t3_sync;
    
        auto start = std::chrono::high_resolution_clock::now();
        {
            helper(video_path_list[0], &t1_sync);
            helper(video_path_list[1], &t2_sync);
            helper(video_path_list[2], &t3_sync);
        }
        auto end = std::chrono::high_resolution_clock::now();
    
        cout << "Synchronous run complete (" << std::to_string(std::chrono::duration<float, std::milli>(end - start).count()) << " ms)" << std::endl;
    
        // asynchronous run
        vector<Mat> t1_async;
        vector<Mat> t2_async;
        vector<Mat> t3_async;
    
        start = std::chrono::high_resolution_clock::now();
        {
            std::thread thread1(helper, video_path_list[0], &t1_async);
            std::thread thread2(helper, video_path_list[1], &t2_async);
            std::thread thread3(helper, video_path_list[2], &t3_async);
    
            thread1.join();
            thread2.join();
            thread3.join();
        }
        end = std::chrono::high_resolution_clock::now();
    
        cout << "All threads complete (Asynchronous run complete) (" << std::to_string(std::chrono::duration<float, std::milli>(end - start).count()) << " ms)" << std::endl;
    
        std::cout << std::to_string(t1_sync.size()) << std::endl;
        std::cout << std::to_string(t2_sync.size()) << std::endl;
        std::cout << std::to_string(t3_sync.size()) << std::endl;
        std::cout << std::to_string(t1_async.size()) << std::endl;
        std::cout << std::to_string(t2_async.size()) << std::endl;
        std::cout << std::to_string(t3_async.size()) << std::endl;
    
        for (int i = 0; i < t1_sync.size(); ++i)
            EXPECT_MAT_NEAR(t1_sync[i], t1_async[i], 0.0);
        for (int i = 0; i < t2_sync.size(); ++i)
            EXPECT_MAT_NEAR(t2_sync[i], t2_async[i], 0.0);
        for (int i = 0; i < t3_sync.size(); ++i)
            EXPECT_MAT_NEAR(t3_sync[i], t3_async[i], 0.0);
    }