Skip to content

Commit

Permalink
New features: DLISR and DLVFX (both required NV RTX GPU)
Browse files Browse the repository at this point in the history
DLISR: NVidia's deep learning based image super resolution filter.
DLVFX: NVidia's deep learning based video effect filters.
 - op=0: artefact reduction
 - op=1: super resolution
 - op=2: denoising (not working yet)
  • Loading branch information
AkarinVS committed Aug 1, 2021
1 parent f124cb6 commit 85a4077
Show file tree
Hide file tree
Showing 20 changed files with 2,972 additions and 0 deletions.
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,32 @@
Plugin akarin
=============

DLVFX
-----
`akarin.DLVFX(clip clip, int op[, float scale=1, float strength=0])`

There are three operation modes:
- `op=0`: artefact reduction. `int strength` controls the strength.
- `op=1`: super resolution, `scale>1` controls the scale factor. `int strength` controls the enhancement strength.
- `op=2`: denoising. `float strength` controls the strength. (Not working.)

Only 32-bit floating point RGB clips are supported for now.

This filter requires appropriate [Video Effects library (v0.6 beta)](https://www.nvidia.com/en-us/geforce/broadcasting/broadcast-sdk/resources/) to be instaled. (This library is too large to be bundled with the plugin.)
This filter also requires RTX-capable Nvidia GPU to run.

DLISR
-----

`akarin.DLISR(clip clip, [, int scale=2])`

This filter will use Nvidia [NGX Technology](https://developer.nvidia.com/rtx/ngx) DLISR DNN to scale up an input clip.
Input clip must be in `vs.RGBS` format.
The `scale` parameter can only be 2/4/8 and note that this filter uses considerable amount of GPU memory (e.g. 2GB for 2x scaling 1080p input)

This filter requires `nvngx_dlisr.dll` to be present in the same directory as this plugin.
This filter requires RTX-capable Nvidia GPU to run.

Expr
----

Expand Down
15 changes: 15 additions & 0 deletions ngx/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
To build a standalone plugin:
```
g++ -o akarin2.dll -shared -gdb ngx*.cc -I. -I ../include -static -DSTANDALONE_NGX
```

To use the plugin:
Rename the patched `nvngx_dlisr.dll` file as `akarin2.dlisr.dll`, and put in the
same directory as `akarin2.dll`:

```
core.std.LoadPlugin(r'/absolute/path/to/akarin2.dll')
c = core.std.BlankClip(format=vs.RGBS) # only support RGBS clips
res = core.akarin2.DLISR(c, scale=2)
res.set_output()
```
41 changes: 41 additions & 0 deletions ngx/autodll.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#ifndef AUTODLL_H
#define AUTODLL_H

#include <string>
#include <vector>
#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif

template<typename T>
struct importDll {
#ifdef _WIN32
importDll(T &x, const wchar_t *dllName, const char *displayname, std::vector<std::string> &errors, const char *name) {
HMODULE h = LoadLibraryW(dllName);
#else
importDll(T &x, const char *dllName, const char *name, std::vector<std::string> &errors) {
const char *displayname = dllName;
void *h = dlopen(dllName, RTLD_GLOBAL | RTLD_LAZY);
#endif
if (h == 0) {
errors.push_back(std::string("unable to load ") + displayname);
return;
}
#ifdef _WIN32
x = reinterpret_cast<T>(GetProcAddress(h, name));
#else
x = reinterpret_cast<T>(dlsym(h, name));
#endif
if (x == nullptr) {
errors.push_back(std::string("unable to find ") + name + " in " + displayname);
}
}
};

#define EXT_FN(dll, retty, name, args) \
static retty (*name) args = nullptr; \
static importDll<retty (*)args> _load ## name(name, dll, #name)

#endif
259 changes: 259 additions & 0 deletions ngx/cuda.h

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions ngx/internalfilters.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef NGX_INTERNALFILTERS_H
#define NGX_INTERNALFILTERS_H

#include "VapourSynth.h"

void VS_CC ngxInitialize(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin);

#endif // INTERNALFILTERS_H
280 changes: 280 additions & 0 deletions ngx/ngx.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
#include <memory>
#include <mutex>
#include <utility>
#include <string>
#include <stdexcept>
#include <vector>
#include <stdio.h>

#include "VapourSynth.h"
#include "VSHelper.h"

#ifndef _WIN32
#error "Unsupported platform"
#else
static std::vector<std::string> autoDllErrors;
static const wchar_t *dllPath(const wchar_t *suffix);
#define NGX_DLL dllPath(L".dlisr.dll"),".dlisr.dll",autoDllErrors
#define CUDA_DLL L"nvcuda.dll","nvcuda.dll",autoDllErrors
#endif
#include "cuda.h"
#include "ngx.h"

#define CK_NGX(x) do { \
int r = (x); \
if (r != NVSDK_NGX_Result_Success) { \
fprintf(stderr, "failed NGX call %s: %x\n", #x, r); \
abort(); \
} \
} while (0)
#define CK_CUDA(x) do { \
int r = (x); \
if (r != CUDA_SUCCESS) { \
fprintf(stderr, "failed cuda call %s: %d\n", #x, r); \
abort(); \
} \
} while (0)

static const wchar_t *dllPath(const wchar_t *suffix) {
static const std::wstring res = [suffix]() -> std::wstring {
HMODULE mod = 0;
if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllPath, &mod)) {
std::vector<wchar_t> buf;
size_t n = 0;
do {
buf.resize(buf.size() + MAX_PATH);
n = GetModuleFileNameW(mod, buf.data(), buf.size());
} while (n >= buf.size());
buf.resize(n);
std::wstring path(buf.begin(), buf.end() - 4);
path += suffix;
return path;
}
throw std::runtime_error("unable to locate myself");
}();
return res.c_str();
}

static void *cudaMalloc(size_t size) {
void *ptr = nullptr;
CK_CUDA(cuMemAlloc_v2(&ptr, size));
return ptr;
}

struct NgxData {
std::mutex lock;

VSNodeRef *node;
VSVideoInfo vi;
int scale;

typedef float T;
uint64_t pixel_size() const { return 3 * sizeof(T); }
uint64_t in_image_width() const { return vi.width / scale; }
uint64_t out_image_width() const { return vi.width; }
uint64_t in_image_height() const { return vi.height / scale; }
uint64_t out_image_height() const { return vi.height; }
uint64_t in_image_row_bytes() const { return pixel_size() * in_image_width(); }
uint64_t out_image_row_bytes() const { return pixel_size() * out_image_width(); }
uint64_t in_size() const { return in_image_height() * in_image_row_bytes(); }
uint64_t out_size() const { return out_image_height() * out_image_row_bytes(); }

NVSDK_NGX_Parameter *param;
NVSDK_NGX_Handle *DUHandle;
CUcontext ctx;

std::vector<uint8_t> in_host, out_host;
CUdeviceptr inp, outp;
void allocate() {
in_host.resize(in_size());
out_host.resize(out_size());
inp = cudaMalloc(in_size());
outp = cudaMalloc(out_size());
}

NgxData() : node(nullptr), vi(), scale(0), param(nullptr), DUHandle(nullptr), ctx(nullptr), inp(nullptr), outp(nullptr) {}
~NgxData() {
if (ctx) {
CK_CUDA(cuCtxPushCurrent(ctx));
if (inp) CK_CUDA(cuMemFree_v2(inp));
if (outp) CK_CUDA(cuMemFree_v2(outp));
if (DUHandle) CK_NGX(NVSDK_NGX_CUDA_ReleaseFeature(DUHandle));
cuCtxPopCurrent(nullptr);
}
}
};

static void VS_CC ngxInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
NgxData *d = static_cast<NgxData *>(*instanceData);
vsapi->setVideoInfo(&d->vi, 1, node);
}

static const VSFrameRef *VS_CC ngxGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
NgxData *d = static_cast<NgxData *>(*instanceData);

if (activationReason == arInitial) {
vsapi->requestFrameFilter(n, d->node, frameCtx);
} else if (activationReason == arAllFramesReady) {
const VSFrameRef *src = vsapi->getFrameFilter(n, d->node, frameCtx);

const VSFormat *fi = d->vi.format;
assert(vsapi->getFrameHeight(src, 0) == (int)d->in_image_height());
assert(vsapi->getFrameWidth(src, 0) == (int)d->in_image_width());
int planes[3] = { 0, 1, 2 };
const VSFrameRef *srcf[3] = { nullptr, nullptr, nullptr };
VSFrameRef *dst = vsapi->newVideoFrame2(fi, d->out_image_width(), d->out_image_height(), srcf, planes, src, core);

// The NGX API is not thread safe.
std::lock_guard<std::mutex> lock(d->lock);
CK_CUDA(cuCtxPushCurrent(d->ctx));

auto params = d->param;
params->Set(NVSDK_NGX_Parameter_Width, (uint64_t)d->in_image_width());
params->Set(NVSDK_NGX_Parameter_Height, (uint64_t)d->in_image_height());
params->Set(NVSDK_NGX_Parameter_Scale, d->scale);

// Create the feature
//CK_NGX(NVSDK_NGX_CUDA_CreateFeature(NVSDK_NGX_Feature_ImageSuperResolution, params, &d->DUHandle));

void *in_image_dev_ptr = d->inp;
void *out_image_dev_ptr = d->outp;;

uint8_t *host = d->in_host.data();
typedef float T;
const T factor = 255.0f;
for (int plane = 0; plane < 3; plane++) {
const size_t stride = vsapi->getStride(src, plane);
const uint8_t *ptr = (uint8_t*)vsapi->getReadPtr(src, plane);
for (size_t i = 0; i < d->in_image_height(); i++)
for (size_t j = 0; j < d->in_image_width(); j++)
*(T*)&host[i * d->in_image_row_bytes() + j * d->pixel_size() + plane * sizeof(T)] = *(T*)&ptr[i * stride + j * sizeof(T)] * factor;
}
CK_CUDA(cuMemcpyHtoD_v2(in_image_dev_ptr, host, d->in_size()));

// Pass the pointers to the GPU allocations to the
// parameter block along with the format and size.
params->Set(NVSDK_NGX_Parameter_Color_SizeInBytes, d->in_size());
params->Set(NVSDK_NGX_Parameter_Color_Format, NVSDK_NGX_Buffer_Format_RGB32F);
params->Set(NVSDK_NGX_Parameter_Color, in_image_dev_ptr);
params->Set(NVSDK_NGX_Parameter_Output_SizeInBytes, d->out_size());
params->Set(NVSDK_NGX_Parameter_Output_Format, NVSDK_NGX_Buffer_Format_RGB32F);
params->Set(NVSDK_NGX_Parameter_Output, out_image_dev_ptr);

// Execute the feature.
CK_NGX(NVSDK_NGX_CUDA_EvaluateFeature(d->DUHandle, params, nullptr));

host = d->out_host.data();
CK_CUDA(cuMemcpyDtoH_v2(host, out_image_dev_ptr, d->out_size()));
for (int plane = 0; plane < 3; plane++) {
const size_t stride = vsapi->getStride(dst, plane);
uint8_t *ptr = (uint8_t*)vsapi->getWritePtr(dst, plane);
for (size_t i = 0; i < d->out_image_height(); i++)
for (size_t j = 0; j < d->out_image_width(); j++)
*(T*)&ptr[i * stride + j * sizeof(T)] = *(T*)&host[i * d->out_image_row_bytes() + j * d->pixel_size() + plane * sizeof(T)] / factor;
}

cuCtxPopCurrent(nullptr);

vsapi->freeFrame(src);
return dst;
}

return nullptr;
}

static void VS_CC ngxFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
NgxData *d = static_cast<NgxData *>(instanceData);
vsapi->freeNode(d->node);

delete d;
}

static void VS_CC ngxCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
std::unique_ptr<NgxData> d(new NgxData);
int err;

try {
if (autoDllErrors.size() > 0) {
std::string error, last;
for (const auto &s: autoDllErrors) {
if (error.size()) {
if (last != s)
error += "; " + s;
} else
error = s;
last = s;
}
throw std::runtime_error(error);
}

d->node = vsapi->propGetNode(in, "clip", 0, &err);
d->vi = *vsapi->getVideoInfo(d->node);

if (!isConstantFormat(&d->vi)) {
throw std::runtime_error("Only clips with constant format and dimensions allowed");
}
if (d->vi.format->numPlanes != 3 || d->vi.format->colorFamily != cmRGB)
throw std::runtime_error("input clip must be RGB format");
if (d->vi.format->sampleType != stFloat || d->vi.format->bitsPerSample != 32)
throw std::runtime_error("input clip must be 32-bit float format");

int scale = int64ToIntS(vsapi->propGetInt(in, "scale", 0, &err));
if (err) scale = 2;
if (scale != 2 && scale != 4 && scale != 8)
throw std::runtime_error("scale must be 2/4/8");
d->scale = scale;
} catch (std::runtime_error &e) {
if (d->node)
vsapi->freeNode(d->node);
vsapi->setError(out, (std::string{ "DLISR: " } + e.what()).c_str());
return;
}

d->vi.width *= d->scale;
d->vi.height *= d->scale;

static bool inited = []() -> bool {
CUcontext ctx;
bool hasCtx = cuCtxGetCurrent(&ctx) == CUDA_SUCCESS;
CK_NGX(NVSDK_NGX_CUDA_Init(0, L"./", NVSDK_NGX_Version_API));
// We don't expect NVSDK_NGX_CUDA_Init to create a context, but if it did, we need to
// switch to save a global CUDA context, instead of the pre-filter context.
if (!hasCtx && cuCtxGetCurrent(&ctx) == CUDA_SUCCESS) {
fprintf(stderr, "invariant violated: NVSDK_NGX_CUDA_Init created CUDA context: %p\n", ctx);
abort();
}
return true;
}();
(void) inited;
NV_new_Parameter(&d->param);

d->param->Set(NVSDK_NGX_Parameter_Width, d->in_image_width());
d->param->Set(NVSDK_NGX_Parameter_Height, d->in_image_height());
d->param->Set(NVSDK_NGX_Parameter_Scale, d->scale);

// Get the scratch buffer size and create the scratch allocation.
size_t byteSize{ 0u };
CK_NGX(NVSDK_NGX_CUDA_GetScratchBufferSize(NVSDK_NGX_Feature_ImageSuperResolution, d->param, &byteSize));
if (byteSize != 0) // should request none.
abort();

// Create the feature
CK_NGX(NVSDK_NGX_CUDA_CreateFeature(NVSDK_NGX_Feature_ImageSuperResolution, d->param, &d->DUHandle));
CK_CUDA(cuCtxGetCurrent(&d->ctx));
d->allocate();
CK_CUDA(cuCtxPopCurrent(nullptr));

vsapi->createFilter(in, out, "DLISR", ngxInit, ngxGetFrame, ngxFree, fmParallel, 0, d.release(), core);
}

//////////////////////////////////////////
// Init

#ifndef STANDALONE_NGX
void VS_CC ngxInitialize(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) {
#else
extern "C" void VS_CC VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) {
configFunc("info.akarin.plugin", "akarin2", "Experimental Nvidia DLISR plugin", VAPOURSYNTH_API_VERSION, 1, plugin);
#endif
registerFunc("DLISR", "clip:clip;scale:int:opt", ngxCreate, nullptr, plugin);
}
Loading

0 comments on commit 85a4077

Please sign in to comment.