From 1ec5a82bfe3f9e74e5c77502d22112f07109c36f Mon Sep 17 00:00:00 2001 From: yhtq <1414672068@qq.com> Date: Thu, 5 Dec 2024 15:41:46 +0800 Subject: [PATCH] 12.5 --- .gitignore | 3 +- ...\346\234\272\346\212\245\345\221\2122.typ" | 7 +- .../main.typ" | 71 +++- .../\344\275\234\344\270\232/ml-5_2-hw.typ" | 84 ++++ .../code6/.gitignore" | 163 +++++++ .../code6/README.md" | 1 + .../code6/pdm.lock" | 402 ++++++++++++++++++ .../code6/pyproject.toml" | 18 + .../code6/src/code6/main.py" | 205 +++++++++ .../code6/train_output" | 129 ++++++ .../main.typ" | 67 +++ .../\344\275\234\344\270\232/hw6.typ" | 141 ++++++ 12 files changed, 1285 insertions(+), 6 deletions(-) create mode 100644 "\346\225\260\347\220\206\351\200\273\350\276\221/\344\275\234\344\270\232/ml-5_2-hw.typ" create mode 100644 "\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/.gitignore" create mode 100644 "\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/README.md" create mode 100644 "\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/pdm.lock" create mode 100644 "\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/pyproject.toml" create mode 100644 "\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/src/code6/main.py" create mode 100644 "\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/train_output" create mode 100644 "\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/\344\275\234\344\270\232/hw6.typ" diff --git a/.gitignore b/.gitignore index fa23b12..047b2fc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ 忽略/** **/**.zip **/target -.VSCodeCounter/** \ No newline at end of file +.VSCodeCounter/** +data/** \ No newline at end of file diff 
--git "a/\345\271\266\350\241\214\344\270\216\345\210\206\345\270\203\345\274\217\350\256\241\347\256\227/\344\270\212\346\234\272\346\212\245\345\221\212/\344\270\212\346\234\272\346\212\245\345\221\2122.typ" "b/\345\271\266\350\241\214\344\270\216\345\210\206\345\270\203\345\274\217\350\256\241\347\256\227/\344\270\212\346\234\272\346\212\245\345\221\212/\344\270\212\346\234\272\346\212\245\345\221\2122.typ" index e359aa9..6187964 100644 --- "a/\345\271\266\350\241\214\344\270\216\345\210\206\345\270\203\345\274\217\350\256\241\347\256\227/\344\270\212\346\234\272\346\212\245\345\221\212/\344\270\212\346\234\272\346\212\245\345\221\2122.typ" +++ "b/\345\271\266\350\241\214\344\270\216\345\210\206\345\270\203\345\274\217\350\256\241\347\256\227/\344\270\212\346\234\272\346\212\245\345\221\212/\344\270\212\346\234\272\346\212\245\345\221\2122.typ" @@ -167,7 +167,7 @@ GCC 编译器可以自动利用向量化指令完成某些运算,但需要非常多的额外信息,包括内存对齐,非别名等等。编译时开启 `-fopt-info-vec-all` 可以帮助我们知道向量化失败的原因。本次实现的初版矩阵乘法为: ```cpp template -Matrix mul_parallel (const Matrix& a, const Matrix& b) { + Matrix mul_parallel (const Matrix& a, const Matrix& b) { if constexpr (debug){ if (a.ncols() != b.nrows()) { fmt::print("Error: Matrix size not match!\n"); @@ -184,7 +184,7 @@ Matrix mul_parallel (const Matrix& a, const Matrix& b T* c_data = c.get(); const T* a_data = a.get(); const T* b_data = b.get(); - 以下 assume 可以帮助进行自动向量化 + // 以下 assume 可以帮助进行自动向量化 __builtin_assume_aligned(c_data, BYTE_ALIGNMENT); __builtin_assume_aligned(a_data, BYTE_ALIGNMENT); __builtin_assume_aligned(b_data, BYTE_ALIGNMENT); @@ -260,7 +260,7 @@ Matrix mul_parallel (const Matrix& a, const Matrix& b ``` 可以很大程度上避免忘记 gap 造成的 Bug === 循环展开 - 事实上,不难发现核心循环的最内层循环次数并不多,因此循环展开可能获得很大的收益。这里最开始担心编译器无法充分优化,手动写了一层展开: + 不难发现核心循环的最内层循环次数并不多,因此循环展开可能获得很大的收益。这里最开始担心编译器无法充分优化,手动写了一层展开: ```cpp // 由于一个 cache_line 事实上只有两个向量,我们手动展开 template @@ -507,4 +507,5 @@ Matrix mul_parallel (const Matrix& a, const Matrix& b - 线程数为 $1, 2, 4, 8$ 时,运行时间几乎线性下降,这和我们之前的分析是一致的。 - 线程数为 
$16$ 时,并没有获得线性加速。推测是因为服务器使用的是 12 个逻辑核的 E5-2650 v4,对于矩阵乘法这样运算非常密集的任务,超线程带来的性能提升当然比不上物理线程。 - $m = n = p = 2048$ 时,矩阵乘法花费约 400ms,性能已经比较好。但是可以看到,与 1024 的实验相比,缓存失效率暴增,说明还有很大的优化空间。 + - 每个实验中,分支预测失败率都非常低,这或许是编译器充分循环展开带来的收益。 - 每个实验中,平均一个 CPU 周期都完成了两条以上的指令,可见现代 CPU 为了尽可能提高性能,也做出了相当大的努力。 diff --git "a/\346\225\260\347\220\206\351\200\273\350\276\221/main.typ" "b/\346\225\260\347\220\206\351\200\273\350\276\221/main.typ" index bba1681..4b12d67 100644 --- "a/\346\225\260\347\220\206\351\200\273\350\276\221/main.typ" +++ "b/\346\225\260\347\220\206\351\200\273\350\276\221/main.typ" @@ -1270,7 +1270,7 @@ ] #proposition[][ 对于任何无穷基数 $m$,任何一致一阶系统都有基数为 $m$ 的模型 - ] + ] #theorem[Compactness][ 若一个一阶系统 $S$ 的公理集的任何有限子集都有模型,则 $S$ 有模型 ] @@ -1364,4 +1364,71 @@ + $1(1 1) = 1 := #transitivity-b(3, 4)$ ] ] - ] \ No newline at end of file + ] + == 一阶算术 + #definition[算术语言][ + 定义一阶语言,包含: + - 常元: $0$ + - 函项符: $s$(后继)$, +, *$ + - 谓词符: $=$ + 以及公理: + + N1: $not1 (s(x_1) = 0)$ + + N2: $s(x_1) = s(x_2) -> x_1 = x_2$ + + N3: $x_1 + 0 = x_1$ + + N4: $x_1 + s(x_2) = s(x_1 + x_2)$ + + N5: $x_1 * 1 = x_1$ + + N6: $x_1 * s(x_2) = x_1 * x_2 + x_1$ + + N7: $calA(0) -> (forall x_1 (calA(x_1) -> calA(s(x_1)))) -> (forall x_1 calA(x_1))$,其中 $x_1$ 在 $calA$ 中自由出现 + + 记 $0^((n))$ 代表 $0$ 的 $n$ 次后继。显然它是一个闭项。 + 一般的,称包含上面定理和公理产生的定理集的一阶理论为一阶算术。 + ] + 注意 N7 理论上不等价于通常的 Peano 公理中的数学归纳法,后者讨论的是所有自然数子集上的性质,这是不可枚举的。而 N7 只是公理模式,它是可数多个公理。 + #lemma[][ + $m = n <=> 0^((m)) = 0^((n))$ + ] + #proof[ + 使用归纳法即可 + ] + #proposition[][ + 任何 $NN$ 的模型都是无穷的。 + ] + 显然,通常的自然数就是一阶算术的一个模型。看起来,这可以表明 $NN$ 是一致的系统。然而,“通常的自然数”这个说法本身依赖于其他数学基础(如何定义自然数),这是不可靠的。事实上,如果这确实是标准模型,可以想象其他任何模型和标准模型几乎没有区别。因此只要 $NN$ 是一致的,它就几乎是完全的。 + #theorem[Godel 不完全性定理][ + 算术系统 $NN$ 是不完全的 + ] + 它的证明我们后面会介绍 + == 形式集合论 + 在朴素集论中,人们遇到了 Russel 悖论。为了消除 Russel 悖论,主要有两种思路。一种是 Russel 自己提出的类型论,但在数学中更常见的是 Zermelo-Fraenkel 公理集合论。 + #definition[一阶 ZF 语言][ + 定义 ZF 语言: + - 没有常元或函项符 + - 谓词符: $=, in$ + - 常用的缩写: + - $t_1 subset.eq t_2 := forall x_1 (x_1 in t_1 -> x in t_2)$ + - $t_1 subset t_2 
:= t_1 subset.eq t_2 and t_1 != t_2$ + - 公理: + + ZF1: $x_1 = x_2 <-> forall x_3 (x_3 in x_1 <-> x_3 in x_2)$(外延公理)(作为推论,可以证明 $x_1 = x_2 <-> x_1 subset.eq x_2 and x_2 subset.eq x_1$ + + ZF2: $exists x_1, forall x_2, not1 x_2 in x_1$(空集公理)(事实上,由 Z1 可以证明在规范模型中,空集是唯一的,因此往往记作 $emptyset$) + + ZF3: $exists x_3 forall x_4 (x_4 in x_3 <-> x_4 = x_1 or x_4 = x_2 )$(对集公理)(事实上,可以证明是存在唯一的,因此往往将其记作 ${x_1, x_2}$ + + ZF4: $exists x_2, forall x_3 (x_3 in x_2 <-> exists x_4 (x_4 in x_1 and x_3 in x_4))$(并集公理,存在一个集合是某个集合所有元素的并,进而可以定义 $t_1 union t_2 := union.big {t_1, t_2}$ + + ZF5: $exists x_2 (forall x_3, x_3 in x_2 <-> x_3 subset x_1)$(幂集公理,存在集合是某个集合的所有子集的集合) + + ZF6: $(forall x_1, exists_1 x_2 calA(x_1, x_2)) -> (forall x_3 exists x_4 forall x_5(x_5 in x_4 <-> exists x_6 (x_6 in x_3 and calA(x_6, x_5))))$(替换公理模式,相当于对于任意集合 $x$ 和"函项" $f$,都存在集合 $f(x)$ + + ZF7: $exists x_1 (emptyset in x_1 and (forall x_2 (x_2 in x_1 -> x_2 union {x_2} in x_1)))$(无穷公理,也就是无穷集(归纳集)存在) + ] + #remark[][ + - ZF 公理体系中没有非集合的个体,但事实上也可以加入非集合的个体,这种扩充往往称为 ZFA + ] + *假设 ZF 是一致的,也即存在模型*,则以它的规范模型为基础,可以建立所有数学的基础。 + + 除了 ZF 的公理外,还有一些已经被证明独立于 ZF (换言之,若 ZF 是一致的,则加上它们之后一致的)的公式,包括: + - 选择公理(等价于 Zorn 引理和良序定理) + - 连续统假设(CH) + + 人们往往认为,以集为数学基础是安全的。哪怕 ZF 系统被发现存在悖论,也能通过添加公理排除掉。如果我们把朴素集称为类,则可以将不是 ZFC 中的集合的类称为真类。NGB 公理集合论选择通过排除真类来避免 Russel 悖论。 + == 一致性问题 + 前面讨论一阶逻辑的一致性时,采用了朴素集合论的语言。然而 ZF 系统本身就在定义集合,再使用朴素集合论似乎有些循环。例如,@model-cardinality 给出 ZF 应该有可枚举的模型,而 ZF 中可以证明存在不可枚举的模型,听起来也十分荒谬。 + #proposition()[相对一致性][ + 设 $S^*$ 是 $S$ 的一个扩充,若 $S^*$ 是一致的,则 $S$ 是一致的 + ] + 一阶逻辑具有绝对一致性,然而一阶逻辑的扩充,例如 ZF 系统,是否具有某种的一致性仍然是一个未解问题。 \ No newline at end of file diff --git "a/\346\225\260\347\220\206\351\200\273\350\276\221/\344\275\234\344\270\232/ml-5_2-hw.typ" "b/\346\225\260\347\220\206\351\200\273\350\276\221/\344\275\234\344\270\232/ml-5_2-hw.typ" new file mode 100644 index 0000000..818e6fd --- /dev/null +++ "b/\346\225\260\347\220\206\351\200\273\350\276\221/\344\275\234\344\270\232/ml-5_2-hw.typ" @@ -0,0 +1,84 @@ +#import "../../template.typ": 
proof, note, corollary, lemma, theorem, definition, example, remark +#import "../../template.typ": * +#import "../main.typ": not1, True, False, infer +#import "../main.typ": * +#show: note.with( + title: "作业5_1", + author: "YHTQ", + date: datetime.today().display(), + logo: none, + withOutlined : false, + withTitle : false, + withHeadingNumbering: false +) +#set heading(numbering: + (..nums) => + { + let nums1 = nums.pos() + nums1.insert(0, 12) + numbering("1.1.(a)", ..nums1) + } +) += #empty + 首先证明: + $ + forall y (forall z (z < y -> calA(z)) -> calA(y)) tack calA(0) + $ + #deduction()[ + + $not1 (z < x) :=$ 基本结论 + + $not1 (z < x) -> (z < x -> calA(z)) := tauto$ + + $z < x -> calA(z) := $ MP + + $forall z (z < x -> calA(z)) := GEN$ + + $calA(x) := $ MP + ] + 再证明: + $ + forall y (forall z (z < y -> calA(z)) -> calA(y)) tack calA(x) + $ + #let basic = "基本结论" + 令 $calB(x') := forall z, z <= x' -> calA(z)$,有: + #deduction[ + + $calB(0) -> (forall x_1 (calB(x_1) -> calB(s(x_1)))) -> calB(x) := $ N7 + + $calA(0) :=$ 前已证 + + $z <= 0 -> z = 0 := basic$ + + $z = 0 -> calA(0) := $ E8 + + $calB(0) := $ transitivity + + $(forall z (z < s(x_1) -> calA(z))) -> calA(s(x_1)) := forall$ elim + + $(forall z (z < s(x_1) -> calA(z))) <-> calB(x_1) := basic, $ 可证等价替换性 + + $calB(x_1) -> calA(s(x_1)) := $ 可证等价替换性 + + $calB(s(x_1)) <-> (forall z (z <= x_1 or z = s(x_1)) -> calA(z)) := $ 可证等价替换性 + + $forall z (z <= x_1 or z = s(x_1)) -> calA(z) <-> forall z (z <= x_1 -> calA(z)) and (z = s(x_1) -> calA(z)) := tauto, $替换 + + $forall z (z <= x_1 -> calA(z)) and (z = s(x_1) -> calA(z)) <-> forall z (z <= x_1 -> calA(z)) and forall z(z = s(x_1) -> calA(z)) := forall and$ + + $calB(s(x_1)) <-> calB(x_1) and forall z(z = s(x_1) -> calA(z)) := $ transitivity + + $forall z(z = s(x_1) -> calA(z)) <-> forall z(z = s(x_1) -> calA(s(x_1))) := $ 可证等价替换性 + + $forall z(z = s(x_1) -> calA(s(x_1))) <-> (exists z (z = s(x_1))) -> calA(s(x_1))$ + + $(exists z (z = s(x_1))) := exists$ intro $s(x_1)$ + + 
$forall z(z = s(x_1) -> calA(s(x_1))) <-> calA(s(x_1)) := $ 可证等价替换性 + + $calB(s(x_1)) <-> calB(x_1) and calA(s(x_1))$ + + $(calB(x_1) -> calA(s(x_1))) -> (calB(x_1) -> (calB(x_1) and calA(s(x_1)))) := tauto$ + + $calB(x_1) -> (calB(x_1) and calA(s(x_1))) := MPb(8, 18)$ + + $calB(x_1) -> calB(s(x_1)) := $ 可证等价替换性 + + $forall x_1 (calB(x_1) -> calB(s(x_1))) := GEN$ + + $calB(x) := $ MP 5, 21, 1 + + $x <= x -> calA(x) := forall$ elim x + + $x <= x := basic$ + + $calA(x) := $ MP + // + $(forall x_1 (calB(x_1) -> calB(s(x_1)))) -> calB(x) := $ MP + // + $(forall z' (z' < s(x_1) -> calA(s(x_1))) -> calA(s(x_1))) := forall$ elim + // + $x_1 = x -> not1 s(x_1) < x := basic$ + // + $not1 s(x_1) < x -> calB(x_1) := fA$ + // + $s(x_1) = x -> calB(s(x_1)) := $ transitivity + // + $s(x_1) > x -> not1 s(x_1) < x := basic$ + // + $s(x_1) > x -> calB(s(x_1)) := $ transitivity + // + $s(x_1) < x -> x_1 < x := basic$ + // + $x_1 < x -> calB(x_1) -> calA(x_1) := tauto$ + // + $s(x_1) < x -> calB(x_1) -> calA(x_1) := $ transitivity + // + $s(x_1) < x -> calB(x_1) -> (s(x_1) < x -> calA(s(x_1)))$ + // + $0 < y + 1 := $ 基本结论 + // + $0 < x := $ E8, MP + // + $$ + ] + 由于左侧都是闭式,演绎定理立刻给出原结论 += #empty + 目前还没有发现 ZF 中存在矛盾,因此 ZF 中没有目前已知的矛盾。ZFC 公理系统提供了足够强的构造能力,绝大多数数学系统(例如算术系统)都可以构造于 ZFC 系统中。换言之,只要 ZFC 系统是一致的/有模型,则这些具体的数学系统也是一致的/有模型的,这就将不同数学分支所研究的数学系统的一致性问题全部转化到了 ZFC 系统的一致性问题上。 + diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/.gitignore" "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/.gitignore" new file mode 100644 index 0000000..d34d673 --- /dev/null +++ "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/.gitignore" @@ -0,0 +1,163 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ 
+eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm-project.org/#use-with-ide +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ +*.ckpt + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/README.md" "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/README.md" new file mode 100644 index 0000000..9d9a24c --- /dev/null +++ "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/README.md" @@ -0,0 +1 @@ +# code6 diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/pdm.lock" "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/pdm.lock" new file mode 100644 index 0000000..76aa52f --- /dev/null +++ "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/pdm.lock" @@ -0,0 +1,402 @@ +# This file is @generated by PDM. +# It is not intended for manual editing. + +[metadata] +groups = ["default"] +strategy = ["inherit_metadata"] +lock_version = "4.5.0" +content_hash = "sha256:0f3090956348c6f577b18f5d4c57f0de852c41f6f09948cc1b3802f5f5c49132" + +[[metadata.targets]] +requires_python = "==3.12.*" + +[[package]] +name = "filelock" +version = "3.16.1" +requires_python = ">=3.8" +summary = "A platform independent file lock." 
+groups = ["default"] +files = [ + {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"}, + {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"}, +] + +[[package]] +name = "fsspec" +version = "2024.10.0" +requires_python = ">=3.8" +summary = "File-system specification" +groups = ["default"] +files = [ + {file = "fsspec-2024.10.0-py3-none-any.whl", hash = "sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871"}, + {file = "fsspec-2024.10.0.tar.gz", hash = "sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493"}, +] + +[[package]] +name = "jinja2" +version = "3.1.4" +requires_python = ">=3.7" +summary = "A very fast and expressive template engine." +groups = ["default"] +dependencies = [ + "MarkupSafe>=2.0", +] +files = [ + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, +] + +[[package]] +name = "markupsafe" +version = "3.0.2" +requires_python = ">=3.9" +summary = "Safely add untrusted strings to HTML/XML markup." 
+groups = ["default"] +files = [ + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +summary = "Python library for arbitrary-precision floating-point arithmetic" +groups = ["default"] +marker = "python_version >= \"3.9\"" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = 
"sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[[package]] +name = "networkx" +version = "3.4.2" +requires_python = ">=3.10" +summary = "Python package for creating and manipulating graphs and networks" +groups = ["default"] +files = [ + {file = "networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f"}, + {file = "networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1"}, +] + +[[package]] +name = "numpy" +version = "2.1.3" +requires_python = ">=3.10" +summary = "Fundamental package for array computing in Python" +groups = ["default"] +files = [ + {file = "numpy-2.1.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e"}, + {file = "numpy-2.1.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958"}, + {file = "numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8"}, + {file = "numpy-2.1.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564"}, + {file = "numpy-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512"}, + {file = "numpy-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b"}, + {file = "numpy-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc"}, + {file = "numpy-2.1.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0"}, + {file = "numpy-2.1.3-cp312-cp312-win32.whl", hash = "sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9"}, + {file = "numpy-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a"}, + {file = "numpy-2.1.3.tar.gz", hash = "sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761"}, +] + +[[package]] +name = "nvidia-cublas-cu12" +version = "12.4.5.8" +requires_python = ">=3" +summary = "CUBLAS native runtime libraries" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc"}, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.4.127" +requires_python = ">=3" +summary = "CUDA profiling tools runtime libs." 
+groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922"}, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.4.127" +requires_python = ">=3" +summary = "NVRTC native runtime libraries" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec"}, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.4.127" +requires_python = ">=3" +summary = "CUDA Runtime native Libraries" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = 
"sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e"}, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.1.0.70" +requires_python = ">=3" +summary = "cuDNN runtime libraries" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +dependencies = [ + "nvidia-cublas-cu12", +] +files = [ + {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, + {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.2.1.3" +requires_python = ">=3" +summary = "CUFFT native runtime libraries" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +dependencies = [ + "nvidia-nvjitlink-cu12", +] +files = [ + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b"}, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.5.147" +requires_python = ">=3" +summary = "CURAND native runtime libraries" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, + {file = 
"nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771"}, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.6.1.9" +requires_python = ">=3" +summary = "CUDA solver native runtime libraries" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +dependencies = [ + "nvidia-cublas-cu12", + "nvidia-cusparse-cu12", + "nvidia-nvjitlink-cu12", +] +files = [ + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c"}, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.3.1.170" +requires_python = ">=3" +summary = "CUSPARSE native runtime libraries" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +dependencies = [ + "nvidia-nvjitlink-cu12", +] +files = [ + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f"}, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.21.5" +requires_python = ">=3" +summary = "NVIDIA Collective Communication Library (NCCL) Runtime" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" 
+files = [ + {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.4.127" +requires_python = ">=3" +summary = "Nvidia JIT LTO Library" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"}, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.4.127" +requires_python = ">=3" +summary = "NVIDIA Tools Extension" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"}, +] + +[[package]] +name = "pillow" +version = "11.0.0" +requires_python = ">=3.9" +summary = "Python Imaging Library (Fork)" +groups = ["default"] +files = [ + {file = "pillow-11.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2c0a187a92a1cb5ef2c8ed5412dd8d4334272617f532d4ad4de31e0495bd923"}, + {file = "pillow-11.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:084a07ef0821cfe4858fe86652fffac8e187b6ae677e9906e192aafcc1b69903"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8069c5179902dcdce0be9bfc8235347fdbac249d23bd90514b7a47a72d9fecf4"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f02541ef64077f22bf4924f225c0fd1248c168f86e4b7abdedd87d6ebaceab0f"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fcb4621042ac4b7865c179bb972ed0da0218a076dc1820ffc48b1d74c1e37fe9"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:00177a63030d612148e659b55ba99527803288cea7c75fb05766ab7981a8c1b7"}, + {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8853a3bf12afddfdf15f57c4b02d7ded92c7a75a5d7331d19f4f9572a89c17e6"}, + {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3107c66e43bda25359d5ef446f59c497de2b5ed4c7fdba0894f8d6cf3822dafc"}, + {file = "pillow-11.0.0-cp312-cp312-win32.whl", hash = "sha256:86510e3f5eca0ab87429dd77fafc04693195eec7fd6a137c389c3eeb4cfb77c6"}, + {file = "pillow-11.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:8ec4a89295cd6cd4d1058a5e6aec6bf51e0eaaf9714774e1bfac7cfc9051db47"}, + {file = "pillow-11.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:27a7860107500d813fcd203b4ea19b04babe79448268403172782754870dac25"}, + {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"}, +] + +[[package]] +name = "setuptools" +version = "75.6.0" +requires_python = ">=3.9" +summary = "Easily download, build, install, upgrade, and uninstall Python packages" +groups = ["default"] +marker = "python_version >= \"3.12\"" +files = [ + {file = "setuptools-75.6.0-py3-none-any.whl", hash = "sha256:ce74b49e8f7110f9bf04883b730f4765b774ef3ef28f722cce7c273d253aaf7d"}, + {file = "setuptools-75.6.0.tar.gz", hash = 
"sha256:8199222558df7c86216af4f84c30e9b34a61d8ba19366cc914424cdbd28252f6"}, +] + +[[package]] +name = "sympy" +version = "1.13.1" +requires_python = ">=3.8" +summary = "Computer algebra system (CAS) in Python" +groups = ["default"] +marker = "python_version >= \"3.9\"" +dependencies = [ + "mpmath<1.4,>=1.1.0", +] +files = [ + {file = "sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8"}, + {file = "sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f"}, +] + +[[package]] +name = "torch" +version = "2.5.1" +requires_python = ">=3.8.0" +summary = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +groups = ["default"] +dependencies = [ + "filelock", + "fsspec", + "jinja2", + "networkx", + "nvidia-cublas-cu12==12.4.5.8; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-cuda-cupti-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-cuda-runtime-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-cudnn-cu12==9.1.0.70; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-cufft-cu12==11.2.1.3; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-curand-cu12==10.3.5.147; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-cusolver-cu12==11.6.1.9; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-cusparse-cu12==12.3.1.170; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-nccl-cu12==2.21.5; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-nvjitlink-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\"", + "nvidia-nvtx-cu12==12.4.127; platform_system == \"Linux\" 
and platform_machine == \"x86_64\"", + "setuptools; python_version >= \"3.12\"", + "sympy==1.12.1; python_version == \"3.8\"", + "sympy==1.13.1; python_version >= \"3.9\"", + "triton==3.1.0; platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\"", + "typing-extensions>=4.8.0", +] +files = [ + {file = "torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03"}, + {file = "torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697"}, + {file = "torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c"}, + {file = "torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1"}, +] + +[[package]] +name = "torchvision" +version = "0.20.1" +requires_python = ">=3.8" +summary = "image and video datasets and models for torch deep learning" +groups = ["default"] +dependencies = [ + "numpy", + "pillow!=8.3.*,>=5.3.0", + "torch==2.5.1", +] +files = [ + {file = "torchvision-0.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1a31256ff945d64f006bb306813a7c95a531fe16bfb2535c837dd4c104533d7a"}, + {file = "torchvision-0.20.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:17cd78adddf81dac57d7dccc9277a4d686425b1c55715f308769770cb26cad5c"}, + {file = "torchvision-0.20.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:9f853ba4497ac4691815ad41b523ee23cf5ba4f87b1ce869d704052e233ca8b7"}, + {file = "torchvision-0.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:4a330422c36dbfc946d3a6c1caec3489db07ecdf3675d83369adb2e5a0ca17c4"}, +] + +[[package]] +name = "triton" +version = "3.1.0" +summary = "A language and compiler for custom Deep Learning operations" +groups = ["default"] +marker = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version 
< \"3.13\"" +dependencies = [ + "filelock", +] +files = [ + {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"}, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +requires_python = ">=3.8" +summary = "Backported and Experimental Type Hints for Python 3.8+" +groups = ["default"] +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/pyproject.toml" "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/pyproject.toml" new file mode 100644 index 0000000..69d7e0d --- /dev/null +++ "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/pyproject.toml" @@ -0,0 +1,18 @@ +[project] +name = "code6" +version = "0.1.0" +description = "Default template for PDM package" +authors = [ + {name = "yhtq", email = "1414672068@qq.com"}, +] +dependencies = ["torch>=2.5.1", "torchvision>=0.20.1"] +requires-python = "==3.12.*" +readme = "README.md" +license = {text = "MIT"} + + +[tool.pdm] +distribution = false + +[tool.pdm.scripts] +start = "src/code6/main.py" diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/src/code6/main.py" "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/src/code6/main.py" new file mode 100644 index 0000000..466a8a4 --- /dev/null +++ "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/src/code6/main.py" @@ -0,0 +1,205 @@ +from 
typing import Type +import torch +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms + + +# Device configuration +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +if torch.cuda.is_available(): + print('GPU is available') +else: + print('GPU is not available') +# Hyper-parameters +input_size = 784 +hidden_size = 500 +hidden_size2 = 300 +num_classes = 10 +num_epochs = 5 +batch_size = 100 +learning_rate = 0.001 + +# MNIST dataset +train_dataset = torchvision.datasets.MNIST(root='../../data', + train=True, + transform=transforms.ToTensor(), + download=True) + +test_dataset = torchvision.datasets.MNIST(root='../../data', + train=False, + transform=transforms.ToTensor()) + +# Data loader +train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=batch_size, + shuffle=True) + +test_loader = torch.utils.data.DataLoader(dataset=test_dataset, + batch_size=batch_size, + shuffle=False) + +# Fully connected neural network with one hidden layer +class NeuralNet(nn.Module): + def __init__(self, input_size, hidden_size, num_classes): + super(NeuralNet, self).__init__() + self.fc1 = nn.Linear(input_size, hidden_size) + self.relu = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, hidden_size2) + self.relu2 = nn.ReLU() + self.fc3 = nn.Linear(hidden_size2, num_classes) + + def forward(self, x): + out = self.fc1(x) + out = self.relu(out) + out = self.fc2(out) + out = self.relu2(out) + out = self.fc3(out) + return out + +class MySGD(torch.optim.Optimizer): + def __init__(self, params, lr=0.01): + defaults = dict(lr=lr) + super(MySGD, self).__init__(params, defaults) + + def step(self, closure=None): + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + d_p = p.grad + p.data.add_(-group['lr'], d_p) + + def print_info(self) -> str: + return f"SGD with lr={self.param_groups[0]['lr']}" + +class MySGD_WithHeavyBall(torch.optim.Optimizer): + def __init__(self, params, lr=0.01, 
momentum_factor=0.9): + defaults = dict(lr=lr, momentum_factor=momentum_factor) + super(MySGD_WithHeavyBall, self).__init__(params, defaults) + for group in self.param_groups: + for p in group['params']: + self.state[p]['v'] = torch.zeros_like(p.data) + + def step(self, closure=None): + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + d_p = p.grad + # v_(t + 1) = beta v_t - grad + # x_(t + 1) = x_t + lr v_(t + 1) + v = self.state[p]['v'] + v.mul_(group['momentum_factor']).sub_(d_p) + p.data.add_(group['lr'], v) + + def print_info(self) -> str: + return f"SGD with Heavy Ball with lr={self.param_groups[0]['lr']}, momentum_factor={self.param_groups[0]['momentum_factor']}" + +class MySGD_WithNesterov(torch.optim.Optimizer): + def __init__(self, params, lr=0.01, momentum_factor=0.9): + defaults = dict(lr=lr, momentum_factor=momentum_factor) + super(MySGD_WithNesterov, self).__init__(params, defaults) + self.t = 0 + for group in self.param_groups: + for p in group['params']: + self.state[p]['x'] = torch.zeros_like(p.data) + + def step(self, closure=None): + beta_t: float = (self.t - 1) / (self.t + 2) + self.t += 1 + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + d_p = p.grad + # x_(t + 1) = p_t - eta grad + # p_(t + 1) = x_(t + 1) + beta_t (x_(t + 1) - x_t) + x_t = self.state[p]['x'] + x_t1 = p.data.sub(d_p, alpha = group['lr']) + p.data = x_t1.add(x_t1.sub(x_t), alpha = beta_t) + x_t.copy_(x_t1) + + def print_info(self) -> str: + return f"SGD with Nesterov with lr={self.param_groups[0]['lr']}, momentum_factor={self.param_groups[0]['momentum_factor']}" + +class MyAdam(torch.optim.Optimizer): + def __init__(self, params, lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-8): + defaults = dict(lr=lr, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon) + super(MyAdam, self).__init__(params, defaults) + self.t = 0 + for group in self.param_groups: + for p in group['params']: + 
self.state[p]['m'] = torch.zeros_like(p.data) + self.state[p]['v'] = torch.zeros_like(p.data) + + def step(self, closure=None): + self.t += 1 + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + d_p = p.grad + m_t = self.state[p]['m'] + v_t = self.state[p]['v'] + m_t.mul_(group['beta_1']).add_(d_p, alpha = 1 - group['beta_1']) + v_t.mul_(group['beta_2']).addcmul_(d_p, d_p, value = 1 - group['beta_2']) + m_hat = m_t / (1 - group['beta_1'] ** self.t) + v_hat = v_t / (1 - group['beta_2'] ** self.t) + + p.data.sub_(m_hat / (v_hat.sqrt() + group['epsilon']), alpha = group['lr']) + + def print_info(self) -> str: + return f"Adam with lr={self.param_groups[0]['lr']}, beta_1={self.param_groups[0]['beta_1']}, beta_2={self.param_groups[0]['beta_2']}, epsilon={self.param_groups[0]['epsilon']}" + +def train_opt(opt: Type[torch.optim.Optimizer], lr: float) -> None: + model = NeuralNet(input_size, hidden_size, num_classes).to(device) + # Loss and optimizer + criterion = nn.CrossEntropyLoss() + # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + optimizer = opt(model.parameters(), lr=lr) + print(optimizer.print_info()) + + # Train the model + total_step = len(train_loader) + for epoch in range(num_epochs): + for i, (images, labels) in enumerate(train_loader): + # Move tensors to the configured device + images = images.reshape(-1, 28*28).to(device) + labels = labels.to(device) + + # Forward pass + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if (i+1) % 100 == 0: + print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' + .format(epoch+1, num_epochs, i+1, total_step, loss.item())) + + # Test the model + # In test phase, we don't need to compute gradients (for memory efficiency) + with torch.no_grad(): + correct = 0 + total = 0 + for images, labels in test_loader: + images = images.reshape(-1, 28*28).to(device) + labels = 
labels.to(device) + outputs = model(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total)) + + # Save the model checkpoint + torch.save(model.state_dict(), f'{optimizer.print_info()}.ckpt') + +train_opt(MySGD, 0.05) +train_opt(MySGD_WithHeavyBall, 0.03) +train_opt(MySGD_WithNesterov, 0.01) +train_opt(MyAdam, 0.001) \ No newline at end of file diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/train_output" "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/train_output" new file mode 100644 index 0000000..f6c4282 --- /dev/null +++ "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/code6/train_output" @@ -0,0 +1,129 @@ +GPU is available +SGD with lr=0.05 +Epoch [1/5], Step [100/600], Loss: 1.4742 +Epoch [1/5], Step [200/600], Loss: 0.6716 +Epoch [1/5], Step [300/600], Loss: 0.3644 +Epoch [1/5], Step [400/600], Loss: 0.4390 +Epoch [1/5], Step [500/600], Loss: 0.3300 +Epoch [1/5], Step [600/600], Loss: 0.3280 +Epoch [2/5], Step [100/600], Loss: 0.5176 +Epoch [2/5], Step [200/600], Loss: 0.3191 +Epoch [2/5], Step [300/600], Loss: 0.3006 +Epoch [2/5], Step [400/600], Loss: 0.3043 +Epoch [2/5], Step [500/600], Loss: 0.2298 +Epoch [2/5], Step [600/600], Loss: 0.2984 +Epoch [3/5], Step [100/600], Loss: 0.0901 +Epoch [3/5], Step [200/600], Loss: 0.3639 +Epoch [3/5], Step [300/600], Loss: 0.2157 +Epoch [3/5], Step [400/600], Loss: 0.3086 +Epoch [3/5], Step [500/600], Loss: 0.1639 +Epoch [3/5], Step [600/600], Loss: 0.1208 +Epoch [4/5], Step [100/600], Loss: 0.2894 +Epoch [4/5], Step [200/600], Loss: 0.1108 +Epoch [4/5], Step [300/600], Loss: 0.1534 +Epoch [4/5], Step [400/600], Loss: 0.2088 +Epoch [4/5], Step [500/600], Loss: 0.2160 +Epoch 
[4/5], Step [600/600], Loss: 0.1488 +Epoch [5/5], Step [100/600], Loss: 0.1651 +Epoch [5/5], Step [200/600], Loss: 0.1749 +Epoch [5/5], Step [300/600], Loss: 0.1752 +Epoch [5/5], Step [400/600], Loss: 0.1696 +Epoch [5/5], Step [500/600], Loss: 0.1583 +Epoch [5/5], Step [600/600], Loss: 0.1383 +Accuracy of the network on the 10000 test images: 95.46 % +SGD with Heavy Ball with lr=0.03, momentum_factor=0.9 +Epoch [1/5], Step [100/600], Loss: 0.4187 +Epoch [1/5], Step [200/600], Loss: 0.2630 +Epoch [1/5], Step [300/600], Loss: 0.3553 +Epoch [1/5], Step [400/600], Loss: 0.1421 +Epoch [1/5], Step [500/600], Loss: 0.2827 +Epoch [1/5], Step [600/600], Loss: 0.3259 +Epoch [2/5], Step [100/600], Loss: 0.1240 +Epoch [2/5], Step [200/600], Loss: 0.1328 +Epoch [2/5], Step [300/600], Loss: 0.1717 +Epoch [2/5], Step [400/600], Loss: 0.1230 +Epoch [2/5], Step [500/600], Loss: 0.1293 +Epoch [2/5], Step [600/600], Loss: 0.1465 +Epoch [3/5], Step [100/600], Loss: 0.1492 +Epoch [3/5], Step [200/600], Loss: 0.0796 +Epoch [3/5], Step [300/600], Loss: 0.1309 +Epoch [3/5], Step [400/600], Loss: 0.1431 +Epoch [3/5], Step [500/600], Loss: 0.0291 +Epoch [3/5], Step [600/600], Loss: 0.0153 +Epoch [4/5], Step [100/600], Loss: 0.0797 +Epoch [4/5], Step [200/600], Loss: 0.0354 +Epoch [4/5], Step [300/600], Loss: 0.0437 +Epoch [4/5], Step [400/600], Loss: 0.0369 +Epoch [4/5], Step [500/600], Loss: 0.0259 +Epoch [4/5], Step [600/600], Loss: 0.0417 +Epoch [5/5], Step [100/600], Loss: 0.0575 +Epoch [5/5], Step [200/600], Loss: 0.0389 +Epoch [5/5], Step [300/600], Loss: 0.0445 +Epoch [5/5], Step [400/600], Loss: 0.0387 +Epoch [5/5], Step [500/600], Loss: 0.0969 +Epoch [5/5], Step [600/600], Loss: 0.0322 +Accuracy of the network on the 10000 test images: 97.89 % +SGD with Nesterov with lr=0.01, momentum_factor=0.9 +Epoch [1/5], Step [100/600], Loss: 2.1211 +Epoch [1/5], Step [200/600], Loss: 0.5482 +Epoch [1/5], Step [300/600], Loss: 0.5049 +Epoch [1/5], Step [400/600], Loss: 0.2170 +Epoch [1/5], 
Step [500/600], Loss: 0.3352 +Epoch [1/5], Step [600/600], Loss: 0.2422 +Epoch [2/5], Step [100/600], Loss: 0.1738 +Epoch [2/5], Step [200/600], Loss: 0.2192 +Epoch [2/5], Step [300/600], Loss: 0.1178 +Epoch [2/5], Step [400/600], Loss: 0.2663 +Epoch [2/5], Step [500/600], Loss: 0.1225 +Epoch [2/5], Step [600/600], Loss: 0.0401 +Epoch [3/5], Step [100/600], Loss: 0.0568 +Epoch [3/5], Step [200/600], Loss: 0.1573 +Epoch [3/5], Step [300/600], Loss: 0.0705 +Epoch [3/5], Step [400/600], Loss: 0.2491 +Epoch [3/5], Step [500/600], Loss: 0.0589 +Epoch [3/5], Step [600/600], Loss: 0.4658 +Epoch [4/5], Step [100/600], Loss: 0.0818 +Epoch [4/5], Step [200/600], Loss: 0.1055 +Epoch [4/5], Step [300/600], Loss: 0.0510 +Epoch [4/5], Step [400/600], Loss: 0.1572 +Epoch [4/5], Step [500/600], Loss: 0.1439 +Epoch [4/5], Step [600/600], Loss: 0.7126 +Epoch [5/5], Step [100/600], Loss: 0.3807 +Epoch [5/5], Step [200/600], Loss: 0.2038 +Epoch [5/5], Step [300/600], Loss: 0.4482 +Epoch [5/5], Step [400/600], Loss: 0.2462 +Epoch [5/5], Step [500/600], Loss: 0.3246 +Epoch [5/5], Step [600/600], Loss: 0.1234 +Accuracy of the network on the 10000 test images: 92.0 % +Adam with lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08 +Epoch [1/5], Step [100/600], Loss: 0.2376 +Epoch [1/5], Step [200/600], Loss: 0.3365 +Epoch [1/5], Step [300/600], Loss: 0.2056 +Epoch [1/5], Step [400/600], Loss: 0.1639 +Epoch [1/5], Step [500/600], Loss: 0.1574 +Epoch [1/5], Step [600/600], Loss: 0.1961 +Epoch [2/5], Step [100/600], Loss: 0.0914 +Epoch [2/5], Step [200/600], Loss: 0.1725 +Epoch [2/5], Step [300/600], Loss: 0.1098 +Epoch [2/5], Step [400/600], Loss: 0.0807 +Epoch [2/5], Step [500/600], Loss: 0.1363 +Epoch [2/5], Step [600/600], Loss: 0.0472 +Epoch [3/5], Step [100/600], Loss: 0.0556 +Epoch [3/5], Step [200/600], Loss: 0.0572 +Epoch [3/5], Step [300/600], Loss: 0.0629 +Epoch [3/5], Step [400/600], Loss: 0.0777 +Epoch [3/5], Step [500/600], Loss: 0.0436 +Epoch [3/5], Step [600/600], Loss: 0.0775 
+Epoch [4/5], Step [100/600], Loss: 0.1248 +Epoch [4/5], Step [200/600], Loss: 0.1043 +Epoch [4/5], Step [300/600], Loss: 0.0586 +Epoch [4/5], Step [400/600], Loss: 0.1358 +Epoch [4/5], Step [500/600], Loss: 0.0816 +Epoch [4/5], Step [600/600], Loss: 0.0265 +Epoch [5/5], Step [100/600], Loss: 0.0430 +Epoch [5/5], Step [200/600], Loss: 0.0089 +Epoch [5/5], Step [300/600], Loss: 0.0333 +Epoch [5/5], Step [400/600], Loss: 0.0156 +Epoch [5/5], Step [500/600], Loss: 0.0098 +Epoch [5/5], Step [600/600], Loss: 0.0460 +Accuracy of the network on the 10000 test images: 97.94 % diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/main.typ" "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/main.typ" index 93c2da6..4358cc5 100644 --- "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/main.typ" +++ "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/main.typ" @@ -686,4 +686,71 @@ === 卷积神经网络 深度学习的发展受到计算机视觉研究的推动,卷积神经网络是其中的代表,它受到了对人类视觉神经元研究的启发。 + 卷积的想法是,我们将数据的某个局部看作相互联系的整体,该局部应该以某种方式共享参数。这样在实现上,也可以极大地减少需要存储的参数数量。实践上,我们用某个待学习的卷积核对数据的某个局部进行卷积,从而得到新的数据。 + + 同时,卷积的形式具有很好的平移不变性,这也在很多时候体现了数据的特征。 + + 卷积网络中还有一个技巧称为 pooling 或者 downsampling,它的目的是减少数据的维度,同时保留数据的主要特征。它相当于对于每个 Channel 的分块做一个集中操作,常见的包括 max pooling (取最大值)和 average pooling(取平均值)。 + + 经典的卷积神经网络包括 AlexNet,采用了 ReLU 激活函数和 GPU 加速,在 ImageNet 上取得了巨大的成功。VGGNet 采用了更深的网络结构,ResNet 采用了残差连接,将上一层信号的一部分直接传递到下一层,要学习的目标变成了信号的变化量。理论上,这个 trick 并不改变网络的表达能力,但极大地加速了训练过程,使得更深的网络变得可能。 + === 递归神经网络 + 对于序列化的数据,例如文本,我们往往希望能够保留数据的顺序信息。递归神经网络是一种很好的选择。抽象来说,给定输入 $x_i$ 输出 $y_i$,希望训练一个模型 $H_t$ 使得: + $ + y_t = H_t (x_1, x_2, ..., x_t) + $ + 大致思路是: + - 设定一个额外的隐藏状态 $h_t$,使得 $h_t = f(x_t, h_(t - 1))$,这个 $f$ 是 $t-$无关的。 + - $y_t = H (x_t, h_(t - 1))$ + 但这会导致对早期信号的记忆很差。一个重要的改进是 LSTM(Long Short Term Memory),它认为之前的 $h$ 是短期记忆,同时引入了一个额外的长期记忆单元,使得网络能够更好地处理长期依赖问题。同时,引入了 
Gate 机制,让网络决定是否更新长期记忆。 + == 神经网络的训练 + 抽象来说,神经网络的训练是一个优化问题: + $ + min_theta 1/n sum_i l(f(x_i, theta), y_i) + lambda norm(theta) + $ + 然而,$f$ 往往具有以下特点: + - 非常复杂,连梯度都并不容易计算 + - 非凸 + - 维度很高 + 因此训练神经网络时,往往会使用 GPU 进行并行化计算。然而,深度学习中,宽度往往是完美并行的,然而深度却产生了数据依赖,无法简单的并行化。这是制约网络变深的一个重要原因。 + + 同时,我们往往会遇到其他重要问题,包括: + - 不同层的梯度规模差距很大,导致作为优化问题看待时,条件数很大。经验上,靠近输出层的梯度很大,靠近输入层的梯度很小,很容易造成梯度爆炸或者梯度消失。这也是为什么 sigmoid 等饱和的激活函数不受欢迎,因为它们会导致梯度消失。 + === 学习率选择 + 为了解决梯度消失问题,另一种思路是,为梯度一直较小的模型设计一个较大的学习率,而为梯度较大的模型设计一个较小的学习率。这就是 Adagrad 的思路,它的更新规则是考虑该参数的累计梯度 $G_t = sum_t g_t^2$,并且在下一步中采用学习率: + $ + eta/(sqrt(G_t) + epsilon) + $ + 其中 $epsilon$ 是防止除零的常数。 + + 然而,经验上学习率一直降低并不是一个好的策略。因此已经被彻底抛弃了。 + + 另一种思路是 rProp,它的更新规则是直接抛弃梯度的大小信息,直接根据梯度的符号来更新参数。然而,当 batch 比较小并采用随机梯度下降时,这种方法往往完全无法使用。 + + 它的一个改进是 RMSProp,它的更新规则是: + $ + v_(t + 1) = beta v_t + (1 - beta) g_t^2\ + theta_(t + 1) = theta_t - eta g_t/(sqrt(v_(t + 1) + epsilon)) + $ + + 最常用的算法是 Adam,它是以上几种方法的改进。具体而言,它的更新规则是: + - 计算梯度的一阶 momentum 和 二阶 momentum: + $ + m_(t + 1) = beta_1 m_t + (1 - beta_1) g_t\ + v_(t + 1) = beta_2 v_t + (1 - beta_2) g_t^2 + $ + - 做 bias correction: + $ + m_(t + 1) = m_(t + 1)/(1 - beta_1^t)\ + v_(t + 1) = v_(t + 1)/(1 - beta_2^t) + $ + 这是为了解决 $0-$初始化产生的问题。 + - 更新参数: + $ + theta_(t + 1) = theta_t - eta m_(t + 1)/(sqrt(v_(t + 1) + epsilon)) + $ + 它已经成为了训练较大规模神经网络的标准方法。 + === Normalization + 另一个重要的问题是梯度消失。一种解决方法是 Batch Normalization,它的思路是对每个 batch 的数据进行归一化,使得每个维度的数据均值为 $0$,方差为 $1$。尽管原因未知,实践表明这样可以极大地增强泛化能力。 + + 另一种常见方法是 drop out,也就是随机选择一些神经元不参与训练。通过一些计算可以证明,这样的效果相当于添加一个正则化项。 = 理论基础 diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/\344\275\234\344\270\232/hw6.typ" "b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/\344\275\234\344\270\232/hw6.typ" new file mode 100644 index 0000000..debd025 --- /dev/null +++ 
"b/\346\234\272\345\231\250\345\255\246\344\271\240\346\225\260\345\255\246\345\257\274\345\274\225/\344\275\234\344\270\232/hw6.typ" @@ -0,0 +1,141 @@ +#import "../../template.typ": * +#import "../main.typ": * +#show: note.with( + title: "作业6", + author: "YHTQ", + date: datetime.today().display(), + logo: none, + withOutlined : false, + withTitle : false, + withHeadingNumbering: true +) += #empty + == #empty + 注意到: + $ + t(x) = + cases( + 1 + x quad -1 < x <= 0, + 1 - x quad 0 < x < 1, + 0 quad "otherwise" + ) + $ + 因此: + $ + t(x) = (sigma(1 - x) - sigma(-x)) + (sigma(1 + x) - sigma(x)) - 1 + $ + 进而 $P_M f$ 可以被两层 RELU 网络表示 + == #empty + 考虑: + $ + abs(f(x) - P_M f(x)) = abs(f(x) - sum_k f(x_k) t((x - x_k) / h))\ + $ + 假设 $x in [x_(k - 1), x_k]$,则上式等于: + $ + abs(f(x) - f(x_(k - 1)) t((x - x_(k - 1)) / h) - f(x_k) t((x - x_k) / h))\ + = abs(f(x) - f(x_(k - 1)) ((x_k - x) / h) - f(x_k) ((x - x_(k - 1)) / h))\ + = abs(((x_k - x) / h) (f(x) - f(x_(k - 1))) + ((x - x_(k - 1)) / h) (f(x) - f(x_k)))\ + <= abs(((x_k - x) / h) (f(x) - f(x_(k - 1)))) + abs(((x - x_(k - 1)) / h) (f(x) - f(x_k)))\ + <= abs( (f(x) - f(x_(k - 1)))) + abs((f(x) - f(x_k)))\ + $ + 因此,对于任何 $epsilon > 0$,由连续函数的一致连续性,取 $delta$ 使得 $abs(x - y) < delta => abs(f(x) - f(y)) < epsilon$,令 $M$ 充分大使得 $h < delta$,立刻有: + $ + abs(f(x) - P_M f(x)) < 2 epsilon + $ + == #empty + 对于 $f(x) = x^2$,做类似估计有: + $ + abs(f(x) - P_M f(x)) <= abs(((x_k - x) / h) (x^2 - x_(k - 1)^2) + ((x - x_(k - 1)) / h) (x^2 - x_k^2))\ + = 1/h (x - x_(k - 1)) (x_k - x) h\ + <= h^2/4 + $ + 因此只需: + $ + 1/(4 M^2) < epsilon\ + M > 1/(2 sqrt(epsilon)) + $ += #empty + == #empty + #let fs = $f^*$ + 上一题已经证明: + $ + abs(P_(2^l) fs (x) - fs(x)) < (1/2^l)^2/4 = 1/4 2^(-2l) + $ + == #empty + 用归纳法,假设: + $ + x in [m/(2^l), (m + 1)/(2^l)]\ + $ + 简略起见,设 $m = 4k$(其他情形类似),则: + $ + x in [k/(2^(l - 2)), (k + 1)/(2^(l - 2))]\ + $ + 归纳假设给出: + $ + P_(2^(l-2)) fs(x) - P_(2^(l - 1)) fs(x) = (g_(l-1) (x))/(2^(2(l-1))) + $ + 同时: + $ + P_(2^(l-2)) fs(x) - P_(2^(l - 1)) 
fs(x) \ + = (k/(2^(l - 2)))^2 ((k + 1) - 2^(l - 2) x) + ((k + 1)/(2^(l - 2)))^2 (2^(l - 2) x - k) - (k/(2^(l - 2)))^2 ((2k + 1) - 2^(l - 1) x) - ((2k + 1)/(2^(l - 1)))^2 (2^(l - 1) x - 2 k)\ + $ + 换言之: + $ + g_(l - 1) (x) =(2 k)^2 ((k + 1) - 2^(l - 2) x) + (2 k + 2)^2 (2^(l-2) x - k) - (2 k)^2 ((2k + 1) - 2^(l-1) x) - (2k + 1)^2 (2^(l-1) x - 2 k)\ + = (2 k)^2 (2^(l-2)x - k) + (2^(l-2) x - k) ((2 k + 2)^2 - 2 (2 k + 1)^2)\ + = 2 (2^(l-2)x - k)\ + <= 2 (2^(l-2) (4 k + 1)/(2^l) - k) = 1/2 + $ + 因此: + $ + g_l (x) = max(0, 1 - abs(2 g_(l - 1) (x) - 1))\ + = max(0, 2 g_(l - 1) (x))\ + = 2 (2^(l-1)x - 2 k) + $ + 由类似的推导,它就是: + $ + P_(2^(l-1)) fs(x) - P_(2^(l)) fs(x) + $ + == #empty + 显然 $g(x) = t(2 x - 1)$ 是两层 RELU 网络,因此 $g_l (x)$ 可以表示为 $2 l$ 层 RELU 网络 += #empty + == #empty + #let hR = $hat(cal(R))$ + #let hRd = $hR_"drop"$ + #let mask = $dot.circle$ + $ + hRd = E (1/(2n) sum_(i = 1)^n ( (beta mask xi)^T x_i - y_i)^2)\ + = 1/(2n) sum_(i = 1)^n E ( (beta mask xi)^T x_i - y_i)^2\ + = 1/(2n) sum_(i = 1)^n E ( (beta mask xi)^T x_i - p beta^T x + p beta^T x - y_i)^2\ + = 1/(2n) sum_(i = 1)^n E ( (beta mask xi)^T x_i - p beta^T x)^2 + (p beta^T x - y_i)^2\ + = hR(hat(beta)) + 1/(2n) sum_(i = 1)^n D ( (beta mask xi)^T x_i) \ + = hR(hat(beta)) + 1/(2n) sum_(i = 1)^n D (sum_d xi_d beta_d x_(i, d)) \ + = hR(hat(beta)) + 1/(2n) sum_(i = 1)^n sum_d D(xi_d beta_d x_(i, d)) \ + = hR(hat(beta)) + 1/(2n) sum_(i = 1)^n sum_k (1-p)/p (p beta_k)^2 (x_(i, k))^2 \ + = hR(hat(beta)) + 1/(2) sum_k (1-p)/p (p beta_k)^2 1/n sum_(i = 1)^n (x_(i, k))^2 \ + = hR(hat(beta)) + (1 - p)/(2 p) sum_k omega_k tilde(beta)_k^2 \ + // = 1/(2n) sum_(i = 1)^n E ( (beta mask xi)^T x_i - p beta^T x)^2 + (p beta^T x - y_i)^2\ + // = 1/(2n) sum_(i = 1)^n D ( (beta mask xi)^T x_i) + (p beta^T x - y_i)^2\ + // = 1/(2n) sum_(i = 1)^n D ( sum_k xi_k beta_k x_(i, k) ) + (p beta^T x - y_i)^2\ + // = 1/(2n) sum_(i = 1)^n sum_k D (xi_k) (beta_k x_(i, k))^2 + (p beta^T x - y_i)^2\ + // = 1/(2n) sum_(i = 1)^n sum_k p (1-p) 
(beta_k x_(i, k))^2 + (p beta^T x - y_i)^2\ + // = 1/(2n) sum_(i = 1)^n sum_k p (1-p) (beta_k x_(i, k))^2 + (p beta^T x - beta^T x + beta^T x - y_i)^2\ + // = 1/(2n) sum_(i = 1)^n sum_k p (1-p) (beta_k x_(i, k))^2 + (p-1)^2 (beta^T x)^2 + 2 (p - 1) beta^T x (beta^T x - y_i) + (beta^T x - y_i)^2\ + // = hR(beta) + 1/(2n) sum_(i = 1)^n sum_k p (1-p) (beta_k x_(i, k))^2 + (p-1)^2 (beta^T x)^2 + 2 (p - 1) beta^T x (beta^T x - y_i)\ + + $ + // 对于某个 $x$,假设 $x in [u, u']$,其中 $u$ 是 $M = 2^(l - 1)$ 时划分点。 + // - 假设 $x in [u, v], v = (u + u') / 2$ 是 $M = 2^l$ 时划分点,有: + // $ + // P_(2^(l - 1)) fs(x) - P_(2^l) fs(x) = u^2 t((x - u) / h) + u'^2 t((x - u') / h) - u^2 t((x - u) / (2 h)) - v^2 t((x - v) / (2 h))\ + // = u^2 ((u' - x) / h) + u'^2 ((x - u) / h) - u^2 ((v - x) / (2 h)) - v^2 ((x - u) / (2 h))\ + // $ + // 归纳假设给出: + // $ + // P_(2^(l - 2)) fs(x) - P_(2^(l - 1)) fs(x) = u^2 ((u' - x) / h) + u'^2 ((x - u) / h) - u^2 ((v - x) / (2 h)) - v^2 ((x - u) / (2 h))\ + // $ += #empty + 代码为: + #raw(read("../code6/src/code6/main.py"), lang: "python", block: true) + 输出为: + #raw(read("../code6/train_output"), lang: "text", block: true) \ No newline at end of file