
Computing the sum of array values in parallel with Metal and Swift

I am trying to compute the sum of a large array in parallel with Metal and Swift.

Is there a good way to do it?

My plan is to divide my array into sub-arrays, compute the sum of each sub-array in parallel, and then, once the parallel computations are finished, compute the sum of the sub-sums.

For example, if I have

array = [a_0, ..., a_n]

I divide the array into sub-arrays:

array_1 = [a_0, ..., a_i], array_2 = [a_i+1, ..., a_2i], ...,
array_n/i = [a_n-1, ..., a_n]

The sums of these sub-arrays are computed in parallel, and I get

sum_1, sum_2, sum_3, ..., sum_n/i

Finally, I just compute the sum of those sub-sums.
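
(To make the plan concrete, here is a minimal CPU-only sketch of the same idea, written in current Swift; the values and the chunk size are invented for illustration and are not part of the original question.)

// Split into chunks, sum each chunk, then sum the partial sums (CPU only, for illustration)
let values: [Float] = (0..<1_000).map { Float($0) }
let chunkSize = 100 // plays the role of `i` in the notation above

// sum_1, sum_2, ... : one partial sum per sub-array
let partialSums: [Float] = stride(from: 0, to: values.count, by: chunkSize).map { start in
    values[start..<min(start + chunkSize, values.count)].reduce(0, +)
}

// The final result is just the sum of the partial sums
let total = partialSums.reduce(0, +)
print(total) // 499500.0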

I created an application that runs my Metal shader, but there are some things I do not quite understand.

var array:[[Float]] = [[1,2,3],[4,5,6],[7,8,9]]

        // get device
        let device: MTLDevice! = MTLCreateSystemDefaultDevice()

        // get library
        let defaultLibrary:MTLLibrary! = device.newDefaultLibrary()

        // queue
        let commandQueue:MTLCommandQueue! = device.newCommandQueue()

        // function
        let kernerFunction: MTLFunction! = defaultLibrary.newFunctionWithName("calculateSum")

        // pipeline with function
        let pipelineState: MTLComputePipelineState! = try device.newComputePipelineStateWithFunction(kernerFunction)

        // buffer for function
        let commandBuffer:MTLCommandBuffer! = commandQueue.commandBuffer()

        // encode function
        let commandEncoder:MTLComputeCommandEncoder = commandBuffer.computeCommandEncoder()

        // add function to encode
        commandEncoder.setComputePipelineState(pipelineState)

        // options
        let resourceOption = MTLResourceOptions()

        let arrayBiteLength = array.count * array[0].count * sizeofValue(array[0][0])

        let arrayBuffer = device.newBufferWithBytes(&array, length: arrayBiteLength, options: resourceOption)

        commandEncoder.setBuffer(arrayBuffer, offset: 0, atIndex: 0)

        var result: [Float] = [0, 0]

        let resultBiteLenght = sizeofValue(result[0])

        let resultBuffer = device.newBufferWithBytes(&result, length: resultBiteLenght, options: resourceOption)

        commandEncoder.setBuffer(resultBuffer, offset: 0, atIndex: 1)

        let threadGroupSize = MTLSize(width: 1, height: 1, depth: 1)

        let threadGroups = MTLSize(width: (array.count), height: 1, depth: 1)

        commandEncoder.dispatchThreadgroups(threadGroups, threadsPerThreadgroup: threadGroupSize)

        commandEncoder.endEncoding()

        commandBuffer.commit()

        commandBuffer.waitUntilCompleted()

        let data = NSData(bytesNoCopy: resultBuffer.contents(), length: sizeof(Float), freeWhenDone: false)

        data.getBytes(&result, length: result.count * sizeof(Float))

        print(result)

That is my Swift code,

and my shader is:

kernel void calculateSum(const device float *inFloat [[ buffer(0) ]],
                         device float *result [[ buffer(1) ]],
                         uint id [[ thread_position_in_grid ]]) {

    // Each thread sums one row of three values and writes its own partial sum
    const device float *f = inFloat + id * 3;
    float sum = 0;
    for (int i = 0; i < 3; ++i) {
        sum = sum + f[i];
    }

    result[id] = sum;
}

I do not know how to define inFloat as an array of arrays.
I do not know exactly what threadGroupSize and threadGroups are.
I do not know what device and uint mean in the shader parameters.

Is this the right approach?
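
(An aside on the first point, offered as an assumption rather than something from the original post: a nested Swift array such as [[Float]] is not one contiguous block of Float values, so copying &array into an MTLBuffer does not hand the shader the numbers themselves. A common workaround is to flatten the data into a single [Float] plus a known row length; the sketch below uses current Swift names, and rowLength is an invented constant matching the 3-element rows above.)

// Flatten [[Float]] into one contiguous [Float] before copying it into a buffer (illustrative only)
let nested: [[Float]] = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
let rowLength = 3                          // length of every inner array
let flat: [Float] = nested.flatMap { $0 }  // [1, 2, 3, 4, 5, 6, 7, 8, 9], one contiguous block

// This flat array (byte length below) is what would be copied into the MTLBuffer; in the shader,
// thread `id` would then read flat[id * rowLength] through flat[id * rowLength + rowLength - 1].
let byteLength = flat.count * MemoryLayout<Float>.stride
let row0Sum = flat[0..<rowLength].reduce(0, +) // what thread 0 would compute: 6.0
print(byteLength, row0Sum) // 36 6.0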

I took the time to create a complete working example of this problem with Metal. The explanation is in the comments:
import Metal

let count = 10_000_000
let elementsPerSum = 10_000

// Data type, has to be the same as in the shader
typealias DataType = CInt

let device = MTLCreateSystemDefaultDevice()!
let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
let pipeline = try! device.newComputePipelineStateWithFunction(parsum)

var data = (0..<count).map{ _ in DataType(arc4random_uniform(100)) } // Our data, randomly generated
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)

let dataBuffer = device.newBufferWithBytes(&data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in a convenient form, to compute the actual result later

let queue = device.newCommandQueue()
let cmds = queue.commandBuffer()
let encoder = cmds.computeCommandEncoder()

encoder.setComputePipelineState(pipeline)

encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)

// We have to calculate the sum `resultsCount` times => the number of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up), because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)

// Here we set that each threadgroup should process `threadExecutionWidth` threads; the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)

encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
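// (Illustrative aside, not from the original answer:) with count = 10_000_000 and
// elementsPerSum = 10_000, resultsCount is 1_000; if threadExecutionWidth is 32, the
// dispatch above launches ceil(1_000 / 32) = 32 threadgroups of 32 threads (1_024 threads).
// The extra threads start past dataLength, so their loop never executes and they write nothing.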
encoder.endEncoding()

var start,end : UInt64
var result : DataType = 0

start = mach_absolute_time()
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
    result += elem
}

end = mach_absolute_time()

print("Metal result: (result),time: (Double(end - start) / Double(NSEC_PER_SEC))")
result = 0

start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
    for elem in buffer {
        result += elem
    }
}
end = mach_absolute_time()

print("CPU result: (result),time: (Double(end - start) / Double(NSEC_PER_SEC))")

The shader:

// Data type, has to be the same as in the Swift file
typedef int DataType;

kernel void parsum(const device DataType* data [[ buffer(0) ]],
                   const device uint& dataLength [[ buffer(1) ]],
                   device DataType* sums [[ buffer(2) ]],
                   const device uint& elementsPerSum [[ buffer(3) ]],
                   const uint tgPos [[ threadgroup_position_in_grid ]],
                   const uint tPerTg [[ threads_per_threadgroup ]],
                   const uint tPos [[ thread_position_in_threadgroup ]]) {
    uint resultIndex = tgPos * tPerTg + tPos; // This is the index of the individual result; this variable is unique to this thread
    uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
    uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end

    for (; dataIndex < endIndex; dataIndex++)
        sums[resultIndex] += data[dataIndex];
}

I tested it on my Mac, but it should run just fine on iOS.

Output:

Metal result: 494936505, time: 0.024611456
CPU result: 494936505, time: 0.163341018

The Metal version is about 7 times faster. I am sure you could get even more speed if you implemented something like a cutoff, divide-and-conquer, or anything along those lines.
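
(One possible direction, sketched below purely as an assumption and not as part of the original answer: since parsum only needs an input buffer, its length, an output buffer and a chunk size, the same kernel can be re-dispatched over its own partial results until few enough values remain to add on the CPU. The sketch reuses device, queue, pipeline, resultsBuffer, resultsCount and DataType from the code above and keeps the same, now outdated, Swift 2-era API names; elementsPerPass and cpuCutoff are invented constants, and with the example's sizes the loop runs only one extra pass.)

// Hypothetical follow-up: keep reducing the partial sums on the GPU by re-running
// `parsum` on its own output, then finish the last few values on the CPU.
let elementsPerPass = 32  // invented chunk size for the extra passes
let cpuCutoff = 100       // invented threshold below which the CPU just finishes the job

var inBuffer = resultsBuffer
var inCount = resultsCount

while inCount > cpuCutoff {
    let outCount = (inCount + elementsPerPass - 1) / elementsPerPass
    let outBuffer = device.newBufferWithLength(strideof(DataType) * outCount, options: []) // zero initialized
    var inCountC = CUnsignedInt(inCount)
    var perPassC = CUnsignedInt(elementsPerPass)

    let pass = queue.commandBuffer()
    let enc = pass.computeCommandEncoder()
    enc.setComputePipelineState(pipeline)
    enc.setBuffer(inBuffer, offset: 0, atIndex: 0)
    enc.setBytes(&inCountC, length: sizeofValue(inCountC), atIndex: 1)
    enc.setBuffer(outBuffer, offset: 0, atIndex: 2)
    enc.setBytes(&perPassC, length: sizeofValue(perPassC), atIndex: 3)

    let width = pipeline.threadExecutionWidth
    enc.dispatchThreadgroups(MTLSize(width: (outCount + width - 1) / width, height: 1, depth: 1),
                             threadsPerThreadgroup: MTLSize(width: width, height: 1, depth: 1))
    enc.endEncoding()
    pass.commit()
    pass.waitUntilCompleted()

    inBuffer = outBuffer
    inCount = outCount
}

// Whatever is left fits comfortably on the CPU
let remaining = UnsafeBufferPointer<DataType>(start: UnsafePointer(inBuffer.contents()), count: inCount)
var reducedResult: DataType = 0
for elem in remaining {
    reducedResult += elem
}
print("Multi-pass Metal result: \(reducedResult)")

The kernel itself is unchanged between passes; only the buffer bindings, the length, and the chunk size differ, which is what makes this kind of divide-and-conquer cheap to bolt on.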
