Unity Compute Shader 활용 가이드

개요

Compute Shader는 렌더링 파이프라인 외부에서 GPU의 병렬 연산 능력을 활용하는 셰이더입니다. 파티클 시뮬레이션, 이미지 처리, 물리 계산, AI 연산 등 CPU로 처리하기 부담스러운 병렬 작업을 GPU에 오프로드할 수 있습니다.

1. 기본 Compute Shader 구조

#pragma kernel CSMain  // Declares CSMain as a compute kernel entry point

// Read/write output texture. The C# side binds a RenderTexture (created with
// enableRandomWrite) to this name through ComputeShader.SetTexture, so the
// resource has to be declared as a texture rather than a structured buffer.
RWTexture2D<float4> Result;

// Edge length of the square output texture in pixels; set from C# via SetInt.
uint TextureSize;

// Thread group size (x, y, z)
[numthreads(8, 8, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    // id.xy is the global index of the current thread, i.e. the pixel it owns.
    // Threads outside the texture (possible when the dispatch rounds the group
    // count up) have nothing to write.
    if (id.x >= TextureSize || id.y >= TextureSize)
    {
        return;
    }

    // Normalised pixel coordinate, written out as a red/green gradient.
    float2 uv = float2(id.xy) / (float)TextureSize;
    Result[id.xy] = float4(uv.x, uv.y, 0.0, 1.0);
}

2. C#에서 Compute Shader 실행

using UnityEngine;

/// <summary>
/// Creates a random-write RenderTexture, fills it once by dispatching the
/// "CSMain" compute kernel, and shows the result on <c>displayMaterial</c>.
/// </summary>
public class ComputeExample : MonoBehaviour
{
    private const int TextureSize = 512;

    // Threads per group along X and Y; mirrors [numthreads(8, 8, 1)] on CSMain.
    private const int ThreadGroupSize = 8;

    [SerializeField] private ComputeShader computeShader;
    [SerializeField] private Material displayMaterial;

    private RenderTexture _renderTexture;

    void Start()
    {
        // Create the RenderTexture; enableRandomWrite makes it writable from a compute shader.
        _renderTexture = new RenderTexture(TextureSize, TextureSize, 0,
            RenderTextureFormat.ARGBFloat);
        _renderTexture.enableRandomWrite = true;
        _renderTexture.Create();

        RunCompute();
    }

    /// <summary>
    /// Binds the output texture, dispatches the kernel over the whole texture
    /// and assigns the texture to the display material.
    /// </summary>
    void RunCompute()
    {
        int kernelHandle = computeShader.FindKernel("CSMain");

        // Bind the texture and its size to the kernel.
        computeShader.SetTexture(kernelHandle, "Result", _renderTexture);
        computeShader.SetInt("TextureSize", TextureSize);

        // Round the group count up so the texture is fully covered even when
        // TextureSize is not a multiple of the group size (for 512 this is
        // still 64 groups per axis).
        int groups = (TextureSize + ThreadGroupSize - 1) / ThreadGroupSize;
        computeShader.Dispatch(kernelHandle, groups, groups, 1);

        // Show the result texture on the material.
        displayMaterial.mainTexture = _renderTexture;
    }

    void OnDestroy()
    {
        // Use Unity's overloaded null check: the null-conditional operator (?.)
        // does not go through it for UnityEngine.Object types.
        if (_renderTexture == null)
        {
            return;
        }

        // Release frees the GPU resource; Destroy removes the RenderTexture
        // object itself, which is not garbage collected on its own.
        _renderTexture.Release();
        Destroy(_renderTexture);
        _renderTexture = null;
    }
}

3. ComputeBuffer — 구조체 데이터 전달

// Per-particle state stored in the structured buffer (8 floats in total).
struct Particle
{
    float3 position;
    float3 velocity;
    float  lifetime;
    float  pad;      // padding for 16-byte alignment
};

// All particles, updated in place by the kernel.
RWStructuredBuffer<Particle> Particles;

// Time step for this update, set from the CPU side.
float DeltaTime;

// Advances one particle per thread and respawns it once its lifetime runs out.
[numthreads(64, 1, 1)]
void UpdateParticles(uint3 id : SV_DispatchThreadID)
{
    uint slot = id.x;

    // Work on a local copy and write the whole element back at the end.
    Particle p = Particles[slot];

    p.position += p.velocity * DeltaTime;
    p.lifetime -= DeltaTime;

    // Respawn at the origin when the lifetime has expired; the new velocity
    // and lifetime are derived from the particle index.
    if (p.lifetime <= 0.0)
    {
        float vx = sin(slot * 1.234) * 2.0;
        float vy = cos(slot * 2.345) * 2.0;

        p.position = float3(0, 0, 0);
        p.velocity = float3(vx, vy, 0.0);
        p.lifetime = 3.0 + frac(slot * 0.1) * 2.0;
    }

    Particles[slot] = p;
}

using UnityEngine;

/// <summary>
/// Simulates particles on the GPU with the "UpdateParticles" compute kernel
/// and draws them with procedural GPU instancing.
/// </summary>
public class ParticleSimulation : MonoBehaviour
{
    private const int ParticleCount = 100000;

    // Threads per group; mirrors [numthreads(64, 1, 1)] on UpdateParticles.
    private const int ThreadGroupSize = 64;

    // Rounded up: plain ParticleCount / 64 truncates to 1562 groups (99968
    // threads) and would leave the last 32 particles un-simulated.
    private const int ThreadGroupCount =
        (ParticleCount + ThreadGroupSize - 1) / ThreadGroupSize;

    // The buffer is padded to a whole number of thread groups so that every
    // dispatched thread addresses a valid element even though the kernel has
    // no range check. Only the first ParticleCount elements are drawn.
    private const int BufferElementCount = ThreadGroupCount * ThreadGroupSize;

    [SerializeField] private ComputeShader computeShader;
    [SerializeField] private Mesh particleMesh;
    [SerializeField] private Material particleMaterial;

    private ComputeBuffer _particleBuffer;
    private int _kernelHandle;

    // CPU-side mirror of the shader's Particle struct (8 floats, 32 bytes).
    struct Particle
    {
        public Vector3 position;
        public Vector3 velocity;
        public float lifetime;
        public float pad;
    }

    void Start()
    {
        _kernelHandle = computeShader.FindKernel("UpdateParticles");

        // stride = size of one struct element in bytes
        int stride = System.Runtime.InteropServices.Marshal.SizeOf<Particle>();
        _particleBuffer = new ComputeBuffer(BufferElementCount, stride);

        // Initial data. The padding elements past ParticleCount stay
        // zero-initialised; the kernel updates them but they are never drawn.
        var particles = new Particle[BufferElementCount];
        var rng = new System.Random(42);
        for (int i = 0; i < ParticleCount; i++)
        {
            particles[i] = new Particle
            {
                position = Vector3.zero,
                velocity = new Vector3(
                    (float)(rng.NextDouble() - 0.5) * 4f,
                    (float)(rng.NextDouble() - 0.5) * 4f,
                    0f
                ),
                lifetime = (float)(rng.NextDouble() * 3f + 1f)
            };
        }
        _particleBuffer.SetData(particles);

        // Bind the buffer to both the compute kernel and the render material.
        computeShader.SetBuffer(_kernelHandle, "Particles", _particleBuffer);
        particleMaterial.SetBuffer("Particles", _particleBuffer);
    }

    void Update()
    {
        computeShader.SetFloat("DeltaTime", Time.deltaTime);

        // One thread per buffer element: ThreadGroupCount groups of 64 threads.
        computeShader.Dispatch(_kernelHandle, ThreadGroupCount, 1, 1);

        // Draw the particles with GPU instancing.
        Graphics.DrawMeshInstancedProcedural(
            particleMesh, 0, particleMaterial,
            new Bounds(Vector3.zero, Vector3.one * 100f),
            ParticleCount
        );
    }

    void OnDestroy()
    {
        // ComputeBuffer is a plain managed wrapper (not a UnityEngine.Object),
        // so the null-conditional operator is safe here.
        _particleBuffer?.Release();
        _particleBuffer = null;
    }
}

4. 스레드 그룹 설계

// 1D workload: 64 threads per group, N groups.
// Dispatch: (N / 64, 1, 1)
[numthreads(64, 1, 1)]
void Process1D(uint3 id : SV_DispatchThreadID)
{
    // Linear element index handled by this thread.
    const uint elementIndex = id.x;
}

// 2D workload: 8x8 threads per group, (W/8 x H/8) groups.
// Dispatch: (W/8, H/8, 1)
[numthreads(8, 8, 1)]
void Process2D(uint3 id : SV_DispatchThreadID)
{
    // 2D coordinate handled by this thread.
    const uint2 pixel = id.xy;
}

// 3D workload: 4x4x4 threads per group.
// Dispatch: (W/4, H/4, D/4)
[numthreads(4, 4, 4)]
void Process3D(uint3 id : SV_DispatchThreadID)
{
    // 3D cell coordinate handled by this thread.
    const uint3 cell = id;
}

5. GPU → CPU 데이터 읽기 (동기 vs. 비동기)

// Reads the GPU results back to the CPU synchronously (causes a stall).
void ReadSync()
{
    Particle[] snapshot = new Particle[ParticleCount];

    // Waits for the GPU to finish before returning (expensive).
    _particleBuffer.GetData(snapshot);

    Particle first = snapshot[0];
    Debug.Log($"First particle: {first.position}");
}

// Asynchronous readback (keeps blocking of GPU rendering to a minimum).
private AsyncGPUReadbackRequest _readbackRequest;

// Queues a readback of the particle buffer; the result is delivered to
// OnReadbackComplete once the request has finished.
void ReadAsync()
{
    _readbackRequest = AsyncGPUReadback.Request(_particleBuffer, OnReadbackComplete);
}

// Completion callback for ReadAsync: logs the first particle, or an error
// if the readback failed.
private void OnReadbackComplete(AsyncGPUReadbackRequest req)
{
    if (req.hasError)
    {
        Debug.LogError("GPU readback error");
        return;
    }

    var particles = req.GetData<Particle>();
    Debug.Log($"First particle: {particles[0].position}");
}

6. AppendStructuredBuffer — 동적 추가

// Output: receives only the particles that are still alive.
AppendStructuredBuffer<Particle> AliveParticles;

// Input: every particle, read-only.
StructuredBuffer<Particle> AllParticles;

// Copies each particle with a positive lifetime into AliveParticles.
[numthreads(64, 1, 1)]
void CullParticles(uint3 id : SV_DispatchThreadID)
{
    Particle candidate = AllParticles[id.x];

    // Skip anything whose lifetime is not strictly positive.
    if (!(candidate.lifetime > 0.0))
    {
        return;
    }

    AliveParticles.Append(candidate);
}

// Create a ComputeBuffer of the Append type and reset its element counter.
_appendBuffer = new ComputeBuffer(ParticleCount,
    stride, ComputeBufferType.Append);
_appendBuffer.SetCounterValue(0);

// NOTE(review): ParticleCount / 64 truncates; when ParticleCount is not a
// multiple of 64 the trailing elements are not visited by this dispatch.
computeShader.Dispatch(kernelHandle, ParticleCount / 64, 1, 1);

// Read back how many particles were appended.
int[] count = new int[1];

// ComputeBuffer is IDisposable; the using statement releases the temporary
// counter buffer even if CopyCount or GetData throws.
using (ComputeBuffer countBuffer = new ComputeBuffer(1,
    sizeof(int), ComputeBufferType.Raw))
{
    ComputeBuffer.CopyCount(_appendBuffer, countBuffer, 0);
    countBuffer.GetData(count);
}

Debug.Log($"Alive particles: {count[0]}");

7. ComputeBuffer 타입 비교

타입	`ComputeBufferType`	용도
Structured	`Default`	구조체 배열 (가장 일반적)
Append/Consume	`Append`	동적 추가 (`AliveParticles.Append(p)`)
Indirect	`IndirectArguments`	`DrawMeshInstancedIndirect` 인수
Raw	`Raw`	`ByteAddressBuffer` — 바이트 단위 접근
Counter	`Counter`	원자적 카운터 (`IncrementCounter()`)

8. DispatchIndirect — GPU에서 디스패치 크기 결정

CPU가 결과를 읽어 오지 않고, GPU에서 계산된 값으로 다음 디스패치의 스레드 그룹 수를 결정할 때 사용합니다. 인수 버퍼에는 X, Y, Z 방향의 스레드 그룹 수가 uint 3개로 저장됩니다.

// Compute Shader (.compute)
// Dispatch only as much work as there are alive particles accumulated in the
// AppendStructuredBuffer.

// C#
// Indirect dispatch arguments are three uints: the thread group counts for
// X, Y and Z.
ComputeBuffer indirectArgs = new ComputeBuffer(3, sizeof(uint),
    ComputeBufferType.IndirectArguments);

// CopyCount below only overwrites the first uint (X). Y and Z must be set to 1
// explicitly, otherwise the dispatch can end up launching zero groups.
indirectArgs.SetData(new uint[] { 0, 1, 1 });

// Step 1: copy the alive-particle count into the X slot of the args buffer.
ComputeBuffer.CopyCount(_aliveBuffer, indirectArgs, 0);

// NOTE(review): the copied value is an element count, but DispatchIndirect
// interprets it as a thread GROUP count. Either declare ProcessAlive with
// [numthreads(1, 1, 1)], or run a small one-thread kernel first that rewrites
// args[0] to ceil(count / groupSize) - confirm against the ProcessAlive kernel.

// Step 2: dispatch using the args buffer - no CPU readback involved.
computeShader.DispatchIndirect(
    computeShader.FindKernel("ProcessAlive"),
    indirectArgs,
    argsOffset: 0);

// Released here to keep the snippet self-contained; in a component the args
// buffer would normally be created once and released in OnDestroy.
indirectArgs.Release();

DispatchIndirect는 CPU-GPU 동기화 없이 GPU가 다음 단계 작업량을 스스로 결정하는 GPU-Driven 렌더링의 핵심 패턴입니다.

정리

Compute Shader는 수만~수백만 개의 데이터를 병렬로 처리해야 할 때 CPU 대비 극적인 성능 향상을 제공합니다. 스레드 그룹 크기는 일반적으로 64(1D) 또는 8×8(2D)이 권장되며, GPU-CPU 데이터 동기화는 AsyncGPUReadback을 사용하여 렌더링 스톨을 방지해야 합니다. 버퍼는 항상 OnDestroy에서 Release()를 호출하여 GPU 메모리를 해제하세요.