diff --git a/contents/core/frameworks/frameworks.bib b/contents/core/frameworks/frameworks.bib index 3bc1c371..7960ec5e 100644 --- a/contents/core/frameworks/frameworks.bib +++ b/contents/core/frameworks/frameworks.bib @@ -1,246 +1,274 @@ %comment{This file was created with betterbib v5.0.11.} - @inproceedings{abadi2016tensorflow, - author = {Yu, Yuan and Abadi, Mart{\'\i}n and Barham, Paul and Brevdo, Eugene and Burrows, Mike and Davis, Andy and Dean, Jeff and Ghemawat, Sanjay and Harley, Tim and Hawkins, Peter and Isard, Michael and Kudlur, Manjunath and Monga, Rajat and Murray, Derek and Zheng, Xiaoqiang}, - booktitle = {Proceedings of the Thirteenth EuroSys Conference}, - doi = {10.1145/3190508.3190551}, - pages = {265--283}, - publisher = {ACM}, - source = {Crossref}, - title = {Dynamic control flow in large-scale machine learning}, - url = {https://doi.org/10.1145/3190508.3190551}, - year = {2018}, - month = apr, -} - -@misc{al2016theano, - author = {Team, The Theano Development and Al-Rfou, Rami and Alain, Guillaume and Almahairi, Amjad and Angermueller, Christof and Bahdanau, Dzmitry and Ballas, Nicolas and Bastien, Fr\'ed\'eric and Bayer, Justin and Belikov, Anatoly and Belopolsky, Alexander and Bengio, Yoshua and Bergeron, Arnaud and Bergstra, James and Bisson, Valentin and Snyder, Josh Bleecher and Bouchard, Nicolas and Boulanger-Lewandowski, Nicolas and Bouthillier, Xavier and de Br\'ebisson, Alexandre and Breuleux, Olivier and Carrier, Pierre-Luc and Cho, Kyunghyun and Chorowski, Jan and Christiano, Paul and Cooijmans, Tim and C\^ot\'e, Marc-Alexandre and C\^ot\'e, Myriam and Courville, Aaron and Dauphin, Yann N. and Delalleau, Olivier and Demouth, Julien and Desjardins, Guillaume and Dieleman, Sander and Dinh, Laurent and Ducoffe, M\'elanie and Dumoulin, Vincent and Kahou, Samira Ebrahimi and Erhan, Dumitru and Fan, Ziye and Firat, Orhan and Germain, Mathieu and Glorot, Xavier and Goodfellow, Ian and Graham, Matt and Gulcehre, Caglar and Hamel, Philippe and Harlouchet, Iban and Heng, Jean-Philippe and Hidasi, Bal\'azs and Honari, Sina and Jain, Arjun and Jean, S\'ebastien and Jia, Kai and Korobov, Mikhail and Kulkarni, Vivek and Lamb, Alex and Lamblin, Pascal and Larsen, Eric and Laurent, C\'esar and Lee, Sean and Lefrancois, Simon and Lemieux, Simon and L\'eonard, Nicholas and Lin, Zhouhan and Livezey, Jesse A. and Lorenz, Cory and Lowin, Jeremiah and Ma, Qianli and Manzagol, Pierre-Antoine and Mastropietro, Olivier and McGibbon, Robert T. and Memisevic, Roland and van Merri\"enboer, Bart and Michalski, Vincent and Mirza, Mehdi and Orlandi, Alberto and Pal, Christopher and Pascanu, Razvan and Pezeshki, Mohammad and Raffel, Colin and Renshaw, Daniel and Rocklin, Matthew and Romero, Adriana and Roth, Markus and Sadowski, Peter and Salvatier, John and Savard, Fran\c{c}ois and Schl\"uter, Jan and Schulman, John and Schwartz, Gabriel and Serban, Iulian Vlad and Serdyuk, Dmitriy and Shabanian, Samira and Simon, \'Etienne and Spieckermann, Sigurd and Subramanyam, S. Ramana and Sygnowski, Jakub and Tanguay, J\'er\'emie and van Tulder, Gijs and Turian, Joseph and Urban, Sebastian and Vincent, Pascal and Visin, Francesco and de Vries, Harm and Warde-Farley, David and Webb, Dustin J. 
and Willson, Matthew and Xu, Kelvin and Xue, Lijun and Yao, Li and Zhang, Saizheng and Zhang, Ying}, - archiveprefix = {arXiv}, - eprint = {1605.02688}, - primaryclass = {cs.SC}, - title = {Theano: {A} Python framework for fast computation of mathematical expressions}, - year = {2016}, + doi = {10.1145/3190508.3190551}, + pages = {1--15}, + source = {Crossref}, + author = {Yu, Yuan and Abadi, Martín and Barham, Paul and Brevdo, Eugene and Burrows, Mike and Davis, Andy and Dean, Jeff and Ghemawat, Sanjay and Harley, Tim and Hawkins, Peter and Isard, Michael and Kudlur, Manjunath and Monga, Rajat and Murray, Derek and Zheng, Xiaoqiang}, + date = {2018-04-23}, + url = {https://doi.org/10.1145/3190508.3190551}, + booktitle = {Proceedings of the Thirteenth EuroSys Conference}, + publisher = {ACM}, + title = {Dynamic control flow in large-scale machine learning}, +} + +@article{al2016theano, + url = {http://arxiv.org/abs/1605.02688v1}, + date = {2016-05-09}, + title = {Theano: A Python framework for fast computation of mathematical expressions}, + author = {Team, The Theano Development and Al-Rfou, Rami and Alain, Guillaume and Almahairi, Amjad and Angermueller, Christof and Bahdanau, Dzmitry and Ballas, Nicolas and Bastien, Frédéric and Bayer, Justin and Belikov, Anatoly and Belopolsky, Alexander and Bengio, Yoshua and Bergeron, Arnaud and Bergstra, James and Bisson, Valentin and Snyder, Josh Bleecher and Bouchard, Nicolas and Boulanger-Lewandowski, Nicolas and Bouthillier, Xavier and de Brébisson, Alexandre and Breuleux, Olivier and Carrier, Pierre-Luc and Cho, Kyunghyun and Chorowski, Jan and Christiano, Paul and Cooijmans, Tim and Côté, Marc-Alexandre and Côté, Myriam and Courville, Aaron and Dauphin, Yann N. and Delalleau, Olivier and Demouth, Julien and Desjardins, Guillaume and Dieleman, Sander and Dinh, Laurent and Ducoffe, Mélanie and Dumoulin, Vincent and Kahou, Samira Ebrahimi and Erhan, Dumitru and Fan, Ziye and Firat, Orhan and Germain, Mathieu and Glorot, Xavier and Goodfellow, Ian and Graham, Matt and Gulcehre, Caglar and Hamel, Philippe and Harlouchet, Iban and Heng, Jean-Philippe and Hidasi, Balázs and Honari, Sina and Jain, Arjun and Jean, Sébastien and Jia, Kai and Korobov, Mikhail and Kulkarni, Vivek and Lamb, Alex and Lamblin, Pascal and Larsen, Eric and Laurent, César and Lee, Sean and Lefrancois, Simon and Lemieux, Simon and Léonard, Nicholas and Lin, Zhouhan and Livezey, Jesse A. and Lorenz, Cory and Lowin, Jeremiah and Ma, Qianli and Manzagol, Pierre-Antoine and Mastropietro, Olivier and McGibbon, Robert T. and Memisevic, Roland and van Merriënboer, Bart and Michalski, Vincent and Mirza, Mehdi and Orlandi, Alberto and Pal, Christopher and Pascanu, Razvan and Pezeshki, Mohammad and Raffel, Colin and Renshaw, Daniel and Rocklin, Matthew and Romero, Adriana and Roth, Markus and Sadowski, Peter and Salvatier, John and Savard, François and Schlüter, Jan and Schulman, John and Schwartz, Gabriel and Serban, Iulian Vlad and Serdyuk, Dmitriy and Shabanian, Samira and Simon, Étienne and Spieckermann, Sigurd and Subramanyam, S. Ramana and Sygnowski, Jakub and Tanguay, Jérémie and van Tulder, Gijs and Turian, Joseph and Urban, Sebastian and Vincent, Pascal and Visin, Francesco and de Vries, Harm and Warde-Farley, David and Webb, Dustin J. and Willson, Matthew and Xu, Kelvin and Xue, Lijun and Yao, Li and Zhang, Saizheng and Zhang, Ying}, + primaryclass = {cs.SC}, + archiveprefix = {arXiv}, + eprint = {1605.02688}, } @inproceedings{brown2020language, - author = {Brown, Tom B. 
and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, - editor = {Larochelle, Hugo and Ranzato, Marc'Aurelio and Hadsell, Raia and Balcan, Maria-Florina and Lin, Hsuan-Tien}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/nips/BrownMRSKDNSSAA20.bib}, - booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual}, - timestamp = {Tue, 19 Jan 2021 00:00:00 +0100}, - title = {Language Models are Few-Shot Learners}, - url = {https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html}, - year = {2020}, + author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, + editor = {Larochelle, Hugo and Ranzato, Marc'Aurelio and Hadsell, Raia and Balcan, Maria-Florina and Lin, Hsuan-Tien}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/nips/BrownMRSKDNSSAA20.bib}, + booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual}, + timestamp = {Tue, 19 Jan 2021 00:00:00 +0100}, + title = {Language Models are Few-Shot Learners}, + url = {https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html}, + year = {2020}, +} + +@article{Baydin_2018, + author = {Baydin, Atilim Gunes and Pearlmutter, Barak A. and Radul, Alexey Andreyevich and Siskind, Jeffrey Mark}, + title = {Automatic Differentiation in Machine Learning: a Survey.}, + journal = {J. Mach. Learn. 
Res.}, + volume = {18}, + pages = {153:1-153:43}, + year = {2017}, + url = {https://jmlr.org/papers/v18/17-468.html}, + source = {DBLP}, +} + +@inproceedings{Abadi_2016, + author = {Abadi, Martín and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and others}, + title = {TensorFlow: A System for Large-Scale Machine Learning}, + booktitle = {12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)}, + pages = {265--283}, + year = {2016}, + publisher = {USENIX Association}, + url = {https://www.usenix.org/conference/osdi16/technical-sessions/presentation/abadi}, } @article{chollet2018keras, - author = {Chollet, Fran\c{c}ois}, - journal = {March 9th}, - title = {Introduction to keras}, - year = {2018}, + author = {Chollet, François}, + journal = {March 9th}, + title = {Introduction to keras}, + year = {2018}, } @article{david2021tensorflow, - author = {David, Robert and Duke, Jared and Jain, Advait and Janapa Reddi, Vijay and Jeffries, Nat and Li, Jian and Kreeger, Nick and Nappier, Ian and Natraj, Meghna and Wang, Tiezhen and others}, - journal = {Proceedings of Machine Learning and Systems}, - pages = {800--811}, - title = {Tensorflow lite micro: {Embedded} machine learning for tinyml systems}, - volume = {3}, - year = {2021}, + author = {David, Robert and Duke, Jared and Jain, Advait and Janapa Reddi, Vijay and Jeffries, Nat and Li, Jian and Kreeger, Nick and Nappier, Ian and Natraj, Meghna and Wang, Tiezhen and others}, + journal = {Proceedings of Machine Learning and Systems}, + pages = {800--811}, + title = {Tensorflow lite micro: Embedded machine learning for tinyml systems}, + volume = {3}, + year = {2021}, } @inproceedings{dean2012large, - author = {Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen, Kai and Devin, Matthieu and Le, Quoc V. and Mao, Mark Z. and Ranzato, Marc'Aurelio and Senior, Andrew W. and Tucker, Paul A. and Yang, Ke and Ng, Andrew Y.}, - editor = {Bartlett, Peter L. and Pereira, Fernando C. N. and Burges, Christopher J. C. and Bottou, L\'eon and Weinberger, Kilian Q.}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/nips/DeanCMCDLMRSTYN12.bib}, - booktitle = {Advances in Neural Information Processing Systems 25: 26th Annual Conference on Neural Information Processing Systems 2012. Proceedings of a meeting held December 3-6, 2012, Lake Tahoe, Nevada, United States}, - pages = {1232--1240}, - timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, - title = {Large Scale Distributed Deep Networks}, - url = {https://proceedings.neurips.cc/paper/2012/hash/6aca97005c68f1206823815f66102863-Abstract.html}, - year = {2012}, + author = {Dean, Jeffrey and Corrado, Greg and Monga, Rajat and 0010, Kai Chen and Devin, Matthieu and Le, Quoc V. and Mao, Mark Z. and Ranzato, Marc'Aurelio and Senior, Andrew W. and Tucker, Paul A. and Yang, Ke and Ng, Andrew Y.}, + title = {Large Scale Distributed Deep Networks.}, + journal = {NIPS}, + pages = {1232--1240}, + year = {2012}, + url = {https://proceedings.neurips.cc/paper/2012/hash/6aca97005c68f1206823815f66102863-Abstract.html}, + source = {DBLP}, + editor = {Bartlett, Peter L. and Pereira, Fernando C. N. and Burges, Christopher J. C. 
and Bottou, Léon and Weinberger, Kilian Q.}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/nips/DeanCMCDLMRSTYN12.bib}, + booktitle = {Advances in Neural Information Processing Systems 25: 26th Annual Conference on Neural Information Processing Systems 2012. Proceedings of a meeting held December 3-6, 2012, Lake Tahoe, Nevada, United States}, + timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, } @inproceedings{deng2009imagenet, - author = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Li, Fei-Fei}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/cvpr/DengDSLL009.bib}, - booktitle = {2009 IEEE Conference on Computer Vision and Pattern Recognition}, - doi = {10.1109/cvpr.2009.5206848}, - pages = {248--255}, - publisher = {IEEE}, - timestamp = {Fri, 27 Mar 2020 00:00:00 +0100}, - title = {{ImageNet:} {A} large-scale hierarchical image database}, - url = {https://doi.org/10.1109/cvpr.2009.5206848}, - year = {2009}, - source = {Crossref}, - month = jun, + doi = {10.1109/cvpr.2009.5206848}, + source = {Crossref}, + author = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei}, + date = {2009-06}, + url = {https://doi.org/10.1109/cvpr.2009.5206848}, + booktitle = {2009 IEEE Conference on Computer Vision and Pattern Recognition}, + publisher = {IEEE}, + title = {ImageNet: A large-scale hierarchical image database}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/cvpr/DengDSLL009.bib}, + pages = {248--255}, + timestamp = {Fri, 27 Mar 2020 00:00:00 +0100}, } @inproceedings{he2016deep, - author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/cvpr/HeZRS16.bib}, - booktitle = {2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, - doi = {10.1109/cvpr.2016.90}, - pages = {770--778}, - publisher = {IEEE}, - timestamp = {Wed, 17 Apr 2019 01:00:00 +0200}, - title = {Deep Residual Learning for Image Recognition}, - url = {https://doi.org/10.1109/cvpr.2016.90}, - year = {2016}, - source = {Crossref}, - month = jun, + doi = {10.1109/cvpr.2016.90}, + source = {Crossref}, + author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + date = {2016-06}, + url = {https://doi.org/10.1109/cvpr.2016.90}, + booktitle = {2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + publisher = {IEEE}, + title = {Deep Residual Learning for Image Recognition}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/cvpr/HeZRS16.bib}, + pages = {770--778}, + timestamp = {Wed, 17 Apr 2019 01:00:00 +0200}, } @inproceedings{jia2014caffe, - author = {Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor}, - booktitle = {Proceedings of the 22nd ACM international conference on Multimedia}, - doi = {10.1145/2647868.2654889}, - pages = {675--678}, - publisher = {ACM}, - source = {Crossref}, - subtitle = {Convolutional Architecture for Fast Feature Embedding}, - title = {Caffe}, - url = {https://doi.org/10.1145/2647868.2654889}, - year = {2014}, - month = nov, -} - -@inproceedings{krizhevsky2012imagenet, - author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, 
Geoffrey E.}, - editor = {Bartlett, Peter L. and Pereira, Fernando C. N. and Burges, Christopher J. C. and Bottou, L\'eon and Weinberger, Kilian Q.}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/nips/KrizhevskySH12.bib}, - booktitle = {Advances in Neural Information Processing Systems 25: 26th Annual Conference on Neural Information Processing Systems 2012. Proceedings of a meeting held December 3-6, 2012, Lake Tahoe, Nevada, United States}, - pages = {1106--1114}, - timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, - title = {{ImageNet} Classification with Deep Convolutional Neural Networks}, - url = {https://proceedings.neurips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html}, - year = {2012}, + doi = {10.1145/2647868.2654889}, + source = {Crossref}, + author = {Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor}, + subtitle = {Convolutional Architecture for Fast Feature Embedding}, + date = {2014-11-03}, + url = {https://doi.org/10.1145/2647868.2654889}, + booktitle = {Proceedings of the 22nd ACM international conference on Multimedia}, + publisher = {ACM}, + title = {Caffe}, + pages = {675--678}, +} + +@article{krizhevsky2012imagenet, + number = {6}, + doi = {10.1145/3065386}, + pages = {84--90}, + source = {Crossref}, + volume = {60}, + author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.}, + date = {2017-05-24}, + url = {https://doi.org/10.1145/3065386}, + issn = {0001-0782,1557-7317}, + journal = {Communications of the ACM}, + publisher = {Association for Computing Machinery (ACM)}, + title = {ImageNet classification with deep convolutional neural networks}, + editor = {Bartlett, Peter L. and Pereira, Fernando C. N. and Burges, Christopher J. C. and Bottou, Léon and Weinberger, Kilian Q.}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/nips/KrizhevskySH12.bib}, + booktitle = {Advances in Neural Information Processing Systems 25: 26th Annual Conference on Neural Information Processing Systems 2012. 
Proceedings of a meeting held December 3-6, 2012, Lake Tahoe, Nevada, United States}, + timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, } @inproceedings{kung1979systolic, - author = {Kung, Hsiang Tsung and Leiserson, Charles E}, - booktitle = {Sparse Matrix Proceedings 1978}, - organization = {Society for industrial and applied mathematics Philadelphia, PA, USA}, - pages = {256--282}, - title = {Systolic arrays (for {VLSI)}}, - volume = {1}, - year = {1979}, + author = {Kung, Hsiang Tsung and Leiserson, Charles E}, + booktitle = {Sparse Matrix Proceedings 1978}, + organization = {Society for industrial and applied mathematics Philadelphia, PA, USA}, + pages = {256--282}, + title = {Systolic arrays (for VLSI)}, + volume = {1}, + year = {1979}, } @article{lai2018cmsis, - author = {Lai, Liangzhen and Suda, Naveen and Chandra, Vikas}, - journal = {ArXiv preprint}, - title = {Cmsis-nn: {Efficient} neural network kernels for arm cortex-m cpus}, - url = {https://arxiv.org/abs/1801.06601}, - volume = {abs/1801.06601}, - year = {2018}, + url = {http://arxiv.org/abs/1801.06601v1}, + date = {2018-01-19}, + title = {CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs}, + author = {Lai, Liangzhen and Suda, Naveen and Chandra, Vikas}, + primaryclass = {cs.NE}, + archiveprefix = {arXiv}, + journal = {ArXiv preprint}, + volume = {abs/1801.06601}, } @inproceedings{li2014communication, - author = {Li, Mu and Andersen, David G. and Smola, Alexander J. and Yu, Kai}, - editor = {Ghahramani, Zoubin and Welling, Max and Cortes, Corinna and Lawrence, Neil D. and Weinberger, Kilian Q.}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/nips/LiASY14.bib}, - booktitle = {Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014, December 8-13 2014, Montreal, Quebec, Canada}, - pages = {19--27}, - timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, - title = {Communication Efficient Distributed Machine Learning with the Parameter Server}, - url = {https://proceedings.neurips.cc/paper/2014/hash/1ff1de774005f8da13f42943881c655f-Abstract.html}, - year = {2014}, + author = {0003, Mu Li and Andersen, David G. and Smola, Alexander J. and Yu, Kai}, + title = {Communication Efficient Distributed Machine Learning with the Parameter Server.}, + journal = {NIPS}, + pages = {19--27}, + year = {2014}, + url = {https://proceedings.neurips.cc/paper/2014/hash/1ff1de774005f8da13f42943881c655f-Abstract.html}, + source = {DBLP}, + editor = {Ghahramani, Zoubin and Welling, Max and Cortes, Corinna and Lawrence, Neil D. and Weinberger, Kilian Q.}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/nips/LiASY14.bib}, + booktitle = {Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014, December 8-13 2014, Montreal, Quebec, Canada}, + timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, } @article{li2017learning, - author = {Li, Zhizhong and Hoiem, Derek}, - doi = {10.1109/tpami.2017.2773081}, - issn = {0162-8828, 2160-9292, 1939-3539}, - journal = {IEEE Trans. Pattern Anal. Mach. 
Intell.}, - number = {12}, - pages = {2935--2947}, - publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, - source = {Crossref}, - title = {Learning without Forgetting}, - url = {https://doi.org/10.1109/tpami.2017.2773081}, - volume = {40}, - year = {2018}, - month = dec, + number = {12}, + doi = {10.1109/tpami.2017.2773081}, + pages = {2935--2947}, + source = {Crossref}, + volume = {40}, + author = {Li, Zhizhong and Hoiem, Derek}, + date = {2018-12-01}, + url = {https://doi.org/10.1109/tpami.2017.2773081}, + issn = {0162-8828,2160-9292,1939-3539}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + title = {Learning without Forgetting}, } @inproceedings{lin2020mcunet, - author = {Lin, Ji and Chen, Wei-Ming and Lin, Yujun and Cohn, John and Gan, Chuang and Han, Song}, - editor = {Larochelle, Hugo and Ranzato, Marc'Aurelio and Hadsell, Raia and Balcan, Maria-Florina and Lin, Hsuan-Tien}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/nips/LinCLCG020.bib}, - booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual}, - timestamp = {Thu, 11 Feb 2021 00:00:00 +0100}, - title = {{MCUNet:} {Tiny} Deep Learning on {IoT} Devices}, - url = {https://proceedings.neurips.cc/paper/2020/hash/86c51678350f656dcc7f490a43946ee5-Abstract.html}, - year = {2020}, + author = {Lin, Ji and Chen, Wei-Ming and Lin, Yujun and Cohn, John and Gan, Chuang and Han, Song}, + editor = {Larochelle, Hugo and Ranzato, Marc'Aurelio and Hadsell, Raia and Balcan, Maria-Florina and Lin, Hsuan-Tien}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/nips/LinCLCG020.bib}, + booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual}, + timestamp = {Thu, 11 Feb 2021 00:00:00 +0100}, + title = {MCUNet: Tiny Deep Learning on IoT Devices}, + url = {https://proceedings.neurips.cc/paper/2020/hash/86c51678350f656dcc7f490a43946ee5-Abstract.html}, + year = {2020}, } @inproceedings{mcmahan2023communicationefficient, - author = {McMahan, Brendan and Moore, Eider and Ramage, Daniel and Hampson, Seth and y Arcas, Blaise Ag\"uera}, - editor = {Singh, Aarti and Zhu, Xiaojin (Jerry)}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/aistats/McMahanMRHA17.bib}, - booktitle = {Proceedings of the 20th International Conference on Artificial Intelligence and Statistics, AISTATS 2017, 20-22 April 2017, Fort Lauderdale, FL, USA}, - pages = {1273--1282}, - publisher = {PMLR}, - series = {Proceedings of Machine Learning Research}, - timestamp = {Wed, 03 Apr 2019 01:00:00 +0200}, - title = {Communication-Efficient Learning of Deep Networks from Decentralized Data}, - url = {http://proceedings.mlr.press/v54/mcmahan17a.html}, - volume = {54}, - year = {2017}, + author = {McMahan, Brendan and Moore, Eider and Ramage, Daniel and Hampson, Seth and y Arcas, Blaise Agüera}, + title = {Communication-Efficient Learning of Deep Networks from Decentralized Data.}, + journal = {AISTATS}, + pages = {1273--1282}, + year = {2017}, + url = {http://proceedings.mlr.press/v54/mcmahan17a.html}, + source = {DBLP}, + editor = {Singh, Aarti and Zhu, Xiaojin 
(Jerry)}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/aistats/McMahanMRHA17.bib}, + booktitle = {Proceedings of the 20th International Conference on Artificial Intelligence and Statistics, AISTATS 2017, 20-22 April 2017, Fort Lauderdale, FL, USA}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + timestamp = {Wed, 03 Apr 2019 01:00:00 +0200}, + volume = {54}, } @inproceedings{paszke2019pytorch, - author = {Ansel, Jason and Yang, Edward and He, Horace and Gimelshein, Natalia and Jain, Animesh and Voznesensky, Michael and Bao, Bin and Bell, Peter and Berard, David and Burovski, Evgeni and Chauhan, Geeta and Chourdia, Anjali and Constable, Will and Desmaison, Alban and DeVito, Zachary and Ellison, Elias and Feng, Will and Gong, Jiong and Gschwind, Michael and Hirsh, Brian and Huang, Sherlock and Kalambarkar, Kshiteej and Kirsch, Laurent and Lazos, Michael and Lezcano, Mario and Liang, Yanbo and Liang, Jason and Lu, Yinghai and Luk, C. K. and Maher, Bert and Pan, Yunjie and Puhrsch, Christian and Reso, Matthias and Saroufim, Mark and Siraichi, Marcos Yukio and Suk, Helen and Zhang, Shunting and Suo, Michael and Tillet, Phil and Zhao, Xu and Wang, Eikan and Zhou, Keren and Zou, Richard and Wang, Xiaodong and Mathews, Ajit and Wen, William and Chanan, Gregory and Wu, Peng and Chintala, Soumith}, - editor = {Wallach, Hanna M. and Larochelle, Hugo and Beygelzimer, Alina and d'Alch\'e-Buc, Florence and Fox, Emily B. and Garnett, Roman}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/nips/PaszkeGMLBCKLGA19.bib}, - booktitle = {Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2}, - pages = {8024--8035}, - timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, - title = {{PyTorch} 2: {Faster} Machine Learning Through Dynamic Python Bytecode Transformation and Graph Compilation}, - url = {https://doi.org/10.1145/3620665.3640366}, - year = {2024}, - doi = {10.1145/3620665.3640366}, - source = {Crossref}, - publisher = {ACM}, - month = apr, + doi = {10.1145/3620665.3640366}, + pages = {929--947}, + source = {Crossref}, + author = {Ansel, Jason and Yang, Edward and He, Horace and Gimelshein, Natalia and Jain, Animesh and Voznesensky, Michael and Bao, Bin and Bell, Peter and Berard, David and Burovski, Evgeni and Chauhan, Geeta and Chourdia, Anjali and Constable, Will and Desmaison, Alban and DeVito, Zachary and Ellison, Elias and Feng, Will and Gong, Jiong and Gschwind, Michael and Hirsh, Brian and Huang, Sherlock and Kalambarkar, Kshiteej and Kirsch, Laurent and Lazos, Michael and Lezcano, Mario and Liang, Yanbo and Liang, Jason and Lu, Yinghai and Luk, C. K. 
and Maher, Bert and Pan, Yunjie and Puhrsch, Christian and Reso, Matthias and Saroufim, Mark and Siraichi, Marcos Yukio and Suk, Helen and Zhang, Shunting and Suo, Michael and Tillet, Phil and Zhao, Xu and Wang, Eikan and Zhou, Keren and Zou, Richard and Wang, Xiaodong and Mathews, Ajit and Wen, William and Chanan, Gregory and Wu, Peng and Chintala, Soumith}, + date = {2024-04-27}, + url = {https://doi.org/10.1145/3620665.3640366}, + booktitle = {Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2}, + publisher = {ACM}, + title = {PyTorch 2: Faster Machine Learning Through Dynamic Python Bytecode Transformation and Graph Compilation}, + editor = {Wallach, Hanna M. and Larochelle, Hugo and Beygelzimer, Alina and d'Alché-Buc, Florence and Fox, Emily B. and Garnett, Roman}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/nips/PaszkeGMLBCKLGA19.bib}, + timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, } @inproceedings{seide2016cntk, - author = {Seide, Frank and Agarwal, Amit}, - booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, - doi = {10.1145/2939672.2945397}, - pages = {2135--2135}, - publisher = {ACM}, - source = {Crossref}, - subtitle = {Microsoft's Open-Source Deep-Learning Toolkit}, - title = {Cntk}, - url = {https://doi.org/10.1145/2939672.2945397}, - year = {2016}, - month = aug, + doi = {10.1145/2939672.2945397}, + source = {Crossref}, + author = {Seide, Frank and Agarwal, Amit}, + subtitle = {Microsoft's Open-Source Deep-Learning Toolkit}, + date = {2016-08-13}, + url = {https://doi.org/10.1145/2939672.2945397}, + booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, + publisher = {ACM}, + title = {CNTK}, + pages = {2135--2135}, } @inproceedings{tokui2015chainer, - author = {Tokui, Seiya and Okuta, Ryosuke and Akiba, Takuya and Niitani, Yusuke and Ogawa, Toru and Saito, Shunta and Suzuki, Shuji and Uenishi, Kota and Vogel, Brian and Yamazaki Vincent, Hiroyuki}, - booktitle = {Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining}, - doi = {10.1145/3292500.3330756}, - pages = {1--6}, - publisher = {ACM}, - source = {Crossref}, - subtitle = {A Deep Learning Framework for Accelerating the Research Cycle}, - title = {Chainer}, - url = {https://doi.org/10.1145/3292500.3330756}, - volume = {5}, - year = {2019}, - month = jul, -} + doi = {10.1145/3292500.3330756}, + pages = {2002--2011}, + source = {Crossref}, + author = {Tokui, Seiya and Okuta, Ryosuke and Akiba, Takuya and Niitani, Yusuke and Ogawa, Toru and Saito, Shunta and Suzuki, Shuji and Uenishi, Kota and Vogel, Brian and Yamazaki Vincent, Hiroyuki}, + subtitle = {A Deep Learning Framework for Accelerating the Research Cycle}, + date = {2019-07-25}, + url = {https://doi.org/10.1145/3292500.3330756}, + booktitle = {Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining}, + publisher = {ACM}, + title = {Chainer}, + volume = {5}, +} \ No newline at end of file diff --git a/contents/core/frameworks/frameworks.qmd b/contents/core/frameworks/frameworks.qmd index 3cec953b..79a89fde 100644 --- a/contents/core/frameworks/frameworks.qmd +++ b/contents/core/frameworks/frameworks.qmd @@ -20,29 +20,21 @@ AI frameworks are a critical middleware software layer that that transforms abst ## Learning 
Objectives -* Understand the evolution and capabilities of major machine learning frameworks. This includes graph execution models, programming paradigms, hardware acceleration support, and how they have expanded over time. +* Understand the historical progression of machine learning frameworks from early numerical libraries to modern deep learning systems -* Learn frameworks' components and functionality, such as computational graphs, data pipelines, optimization algorithms, training loops, etc., that enable efficient model building. +* Understand framework fundamentals such as tensor data structures, computational graphs, execution models, and memory management in ML frameworks -* Compare frameworks across different environments, such as cloud, edge, and TinyML. Learn how frameworks specialize based on computational constraints and hardware. - -* Dive deeper into embedded and TinyML-focused frameworks like TensorFlow Lite Micro, CMSIS-NN, TinyEngine, etc., and how they optimize for microcontrollers. - -* When choosing a framework, explore model conversion and deployment considerations, including latency, memory usage, and hardware support. - -* Evaluate key factors in selecting the right framework, like performance, hardware compatibility, community support, ease of use, etc., based on the specific project needs and constraints. - -* Understand the limitations of current frameworks and potential future trends, such as using ML to improve frameworks, decomposed ML systems, and high-performance compilers. +* Differentiate between machine learning framework architectures, execution strategies, and development tools +* Understand the specialization of machine learning frameworks for cloud, edge, mobile, and TinyML applications ::: ## Overview Modern machine learning development relies fundamentally on machine learning frameworks, which are comprehensive software libraries or platforms designed to simplify the development, training, and deployment of machine learning models. These frameworks serve multiple roles in ML systems, much like operating systems are the foundation of computing systems. Just as operating systems abstract away the complexity of hardware resources and provide standardized interfaces for applications, ML frameworks abstract the intricacies of mathematical operations and hardware acceleration, providing standardized APIs for ML development. -The capabilities of ML frameworks are diverse and continuously evolving. They provide efficient implementations of mathematical operations, automatic differentiation capabilities, and tools for managing model development, hardware acceleration, and memory utilization. For production systems, they offer standardized approaches to model deployment, versioning, and optimization. However, due to their diversity, there is no universally agreed-upon definition of an ML framework. - -To establish clarity for this chapter, we adopt the following definition: +The capabilities of ML frameworks are diverse and continuously evolving. They provide efficient implementations of mathematical operations, automatic differentiation capabilities, and tools for managing model development, hardware acceleration, and memory utilization. For production systems, they offer standardized approaches to model deployment, versioning, and optimization. However, due to their diversity, there is no universally agreed-upon definition of an ML framework.
To establish clarity for this chapter, we adopt the following definition: ::: {.callout-note} @@ -64,83 +56,77 @@ The evolution of machine learning frameworks mirrors the broader development of ### Early Numerical Libraries -The foundation for modern ML frameworks begins at the most fundamental level of computation: matrix operations. Machine learning computations are primarily matrix-matrix and matrix-vector multiplications. The Basic Linear Algebra Subprograms (BLAS), developed in 1979, provided these essential matrix operations that would become the computational backbone of machine learning. These low-level operations, when combined and executed efficiently, enable the complex calculations required for training neural networks and other ML models. +The foundation for modern ML frameworks begins at the most fundamental level of computation: matrix operations. Machine learning computations are primarily matrix-matrix and matrix-vector multiplications. The Basic Linear Algebra Subprograms ([BLAS](https://www.netlib.org/blas/)), developed in 1979, provided these essential matrix operations that would become the computational backbone of machine learning [@kung1979systolic]. These low-level operations, when combined and executed efficiently, enable the complex calculations required for training neural networks and other ML models. -Building upon BLAS, the Linear Algebra Package (LAPACK) emerged in 1992, extending these capabilities with more sophisticated linear algebra operations such as matrix decompositions, eigenvalue problems, and linear system solutions. This layered approach of building increasingly complex operations from fundamental matrix computations became a defining characteristic of ML frameworks. +Building upon BLAS, the Linear Algebra Package ([LAPACK](https://www.netlib.org/lapack/)) emerged in 1992, extending these capabilities with more sophisticated linear algebra operations such as matrix decompositions, eigenvalue problems, and linear system solutions. This layered approach of building increasingly complex operations from fundamental matrix computations became a defining characteristic of ML frameworks. -The development of NumPy in 2006 marked a crucial milestone in this evolution, building upon its predecessors Numeric and Numarray to become the fundamental package for numerical computation in Python. NumPy introduced n-dimensional array objects and essential mathematical functions, but more importantly, it provided an efficient interface to these underlying BLAS and LAPACK operations. This abstraction allowed developers to work with high-level array operations while maintaining the performance of optimized low-level matrix computations. +The development of [NumPy](https://numpy.org/) in 2006 marked a crucial milestone in this evolution, building upon its predecessors Numeric and Numarray to become the fundamental package for numerical computation in Python. NumPy introduced n-dimensional array objects and essential mathematical functions, but more importantly, it provided an efficient interface to these underlying BLAS and LAPACK operations. This abstraction allowed developers to work with high-level array operations while maintaining the performance of optimized low-level matrix computations. -In 2001, SciPy emerged as a powerful extension built on top of NumPy, adding specialized functions for optimization, linear algebra, and signal processing. 
This further exemplified the pattern of progressive abstraction in ML frameworks: from basic matrix operations to sophisticated numerical computations, and eventually to high-level machine learning algorithms. This layered architecture, starting from fundamental matrix operations and building upward, would become a blueprint for future ML frameworks. +In 2001, [SciPy](https://scipy.org/) emerged as a powerful extension built on top of NumPy, adding specialized functions for optimization, linear algebra, and signal processing. This further exemplified the pattern of progressive abstraction in ML frameworks: from basic matrix operations to sophisticated numerical computations, and eventually to high-level machine learning algorithms. This layered architecture, starting from fundamental matrix operations and building upward, would become a blueprint for future ML frameworks. ### First-Generation ML Frameworks The transition from numerical libraries to dedicated machine learning frameworks marked a crucial evolution in abstraction. While the underlying computations remained rooted in matrix operations, frameworks began to encapsulate these operations into higher-level machine learning primitives. The University of Waikato introduced Weka in 1993, one of the earliest ML frameworks, which abstracted matrix operations into data mining tasks, though it was limited by its Java implementation and focus on smaller-scale computations. -Scikit-learn, emerging in 2007, was a significant advancement in this abstraction. Building upon the NumPy and SciPy foundation, it transformed basic matrix operations into intuitive ML algorithms. For example, what was fundamentally a series of matrix multiplications and gradient computations became a simple "fit()" method call in a logistic regression model. This abstraction pattern - hiding complex matrix operations behind clean APIs - would become a defining characteristic of modern ML frameworks. +[Scikit-learn](https://scikit-learn.org/stable/), emerging in 2007, was a significant advancement in this abstraction. Building upon the NumPy and SciPy foundation, it transformed basic matrix operations into intuitive ML algorithms. For example, what was fundamentally a series of matrix multiplications and gradient computations became a simple `fit()` method call in a logistic regression model. This abstraction pattern - hiding complex matrix operations behind clean APIs - would become a defining characteristic of modern ML frameworks. -Theano, which appeared in 2007, was a major advancement---developed at the Montreal Institute for Learning Algorithms (MILA)---Theano introduced two revolutionary concepts: computational graphs and GPU acceleration. Computational graphs represented mathematical operations as directed graphs, with matrix operations as nodes and data flowing between them. This graph-based approach allowed for automatic differentiation and optimization of the underlying matrix operations. More importantly, it enabled the framework to automatically route these operations to GPU hardware, dramatically accelerating matrix computations. +[Theano](https://deeplearning.net/software/theano/), which appeared in 2007, was a major advancement. Developed at the Montreal Institute for Learning Algorithms (MILA), Theano introduced two revolutionary concepts: computational graphs and GPU acceleration [@al2016theano]. Computational graphs represented mathematical operations as directed graphs, with matrix operations as nodes and data flowing between them. 
This graph-based approach allowed for automatic differentiation and optimization of the underlying matrix operations. More importantly, it enabled the framework to automatically route these operations to GPU hardware, dramatically accelerating matrix computations. -Meanwhile, Torch, created at NYU in 2002, took a different approach to handling matrix operations. It emphasized immediate execution of operations (eager execution) and provided a flexible interface for neural network implementations. Torch's design philosophy of prioritizing developer experience while maintaining high performance influenced many subsequent frameworks. Its architecture demonstrated how to balance high-level abstractions with efficient low-level matrix operations, establishing design patterns that would later influence frameworks like PyTorch. +Meanwhile, [Torch](http://torch.ch/), created at NYU in 2002, took a different approach to handling matrix operations. It emphasized immediate execution of operations (eager execution) and provided a flexible interface for neural network implementations. Torch's design philosophy of prioritizing developer experience while maintaining high performance influenced many subsequent frameworks. Its architecture demonstrated how to balance high-level abstractions with efficient low-level matrix operations, establishing design patterns that would later influence frameworks like PyTorch. ### Rise of Deep Learning Frameworks The deep learning revolution demanded a fundamental shift in how frameworks handled matrix operations, primarily due to three factors: the massive scale of computations, the complexity of gradient calculations through deep networks, and the need for distributed processing. Traditional frameworks, designed for classical machine learning algorithms, couldn't efficiently handle the billions of matrix operations required for training deep neural networks. -Caffe, released by UC Berkeley in 2013, pioneered this new generation by introducing specialized implementations of convolutional operations. While convolutions are mathematically equivalent to specific patterns of matrix multiplication, Caffe optimized these patterns specifically for computer vision tasks, demonstrating how specialized matrix operation implementations could dramatically improve performance for specific network architectures. +[Caffe](https://caffe.berkeleyvision.org/), released by UC Berkeley in 2013, pioneered this new generation by introducing specialized implementations of convolutional operations [@jia2014caffe]. While convolutions are mathematically equivalent to specific patterns of matrix multiplication, Caffe optimized these patterns specifically for computer vision tasks, demonstrating how specialized matrix operation implementations could dramatically improve performance for specific network architectures. -Google's TensorFlow, introduced in 2015, revolutionized the field by treating matrix operations as part of a distributed computing problem. It represented all computations, from individual matrix multiplications to entire neural networks, as a static computational graph that could be split across multiple devices. This approach enabled training of unprecedented model sizes by distributing matrix operations across clusters of computers and specialized hardware. TensorFlow's static graph approach, while initially constraining, allowed for aggressive optimization of matrix operations through techniques like kernel fusion and memory planning. 
+Google's [TensorFlow](https://www.tensorflow.org/), introduced in 2015, revolutionized the field by treating matrix operations as part of a distributed computing problem [@dean2012large]. It represented all computations, from individual matrix multiplications to entire neural networks, as a static computational graph that could be split across multiple devices. This approach enabled training of unprecedented model sizes by distributing matrix operations across clusters of computers and specialized hardware. TensorFlow's static graph approach, while initially constraining, allowed for aggressive optimization of matrix operations through techniques like kernel fusion and memory planning. -Facebook's PyTorch, launched in 2016, took a radically different approach to handling matrix computations. Instead of static graphs, PyTorch introduced dynamic computational graphs that could be modified on the fly. This dynamic approach, while potentially sacrificing some optimization opportunities, made it much easier for researchers to debug and understand the flow of matrix operations in their models. PyTorch's success demonstrated that the ability to introspect and modify computations dynamically was as important as raw performance for many applications. +Facebook's [PyTorch](https://pytorch.org/), launched in 2016, took a radically different approach to handling matrix computations. Instead of static graphs, PyTorch introduced dynamic computational graphs that could be modified on the fly [@paszke2019pytorch]. This dynamic approach, while potentially sacrificing some optimization opportunities, made it much easier for researchers to debug and understand the flow of matrix operations in their models. PyTorch's success demonstrated that the ability to introspect and modify computations dynamically was as important as raw performance for many applications. -Amazon's MXNet approached the challenge of large-scale matrix operations by focusing on memory efficiency and scalability across different hardware configurations. It introduced a hybrid approach that combined aspects of both static and dynamic graphs, allowing for flexible model development while still enabling aggressive optimization of the underlying matrix operations. +Amazon's [MXNet](https://mxnet.apache.org/) approached the challenge of large-scale matrix operations by focusing on memory efficiency and scalability across different hardware configurations. It introduced a hybrid approach that combined aspects of both static and dynamic graphs, allowing for flexible model development while still enabling aggressive optimization of the underlying matrix operations. ### Hardware Influence on Design -Hardware developments have fundamentally reshaped how frameworks implement and optimize matrix operations. The introduction of NVIDIA's CUDA platform in 2007 marked a pivotal moment in framework design by enabling general-purpose computing on GPUs. This was transformative because GPUs excel at parallel matrix operations, offering orders of magnitude speedup for the core computations in deep learning. While a CPU might process matrix elements sequentially, a GPU can process thousands of elements simultaneously, fundamentally changing how frameworks approach computation scheduling. +Hardware developments have fundamentally reshaped how frameworks implement and optimize matrix operations. 
The introduction of [NVIDIA's CUDA platform](https://developer.nvidia.com/cuda-toolkit) in 2007 marked a pivotal moment in framework design by enabling general-purpose computing on GPUs. This was transformative because GPUs excel at parallel matrix operations, offering orders of magnitude speedup for the core computations in deep learning. While a CPU might process matrix elements sequentially, a GPU can process thousands of elements simultaneously, fundamentally changing how frameworks approach computation scheduling. -The development of hardware-specific accelerators further revolutionized framework design. Google's Tensor Processing Units (TPUs), first deployed in 2016, were purpose-built for tensor operations, the fundamental building blocks of deep learning computations. TPUs introduced systolic array architectures, which are particularly efficient for matrix multiplication and convolution operations. This hardware architecture prompted frameworks like TensorFlow to develop specialized compilation strategies that could map high-level operations directly to TPU instructions, bypassing traditional CPU-oriented optimizations. +The development of hardware-specific accelerators further revolutionized framework design. [Google's Tensor Processing Units (TPUs)](https://cloud.google.com/tpu/), first deployed in 2016, were purpose-built for tensor operations, the fundamental building blocks of deep learning computations. TPUs introduced systolic array architectures[^defn-systolic], which are particularly efficient for matrix multiplication and convolution operations. This hardware architecture prompted frameworks like TensorFlow to develop specialized compilation strategies that could map high-level operations directly to TPU instructions, bypassing traditional CPU-oriented optimizations. -Mobile hardware accelerators, such as Apple's Neural Engine (2017) and Qualcomm's Neural Processing Units, brought new constraints and opportunities to framework design. These devices emphasized power efficiency over raw computational speed, requiring frameworks to develop new strategies for quantization and operator fusion. Mobile frameworks like TensorFlow Lite and PyTorch Mobile needed to balance model accuracy with energy consumption, leading to innovations in how matrix operations are scheduled and executed. +[^defn-systolic]: **Systolic Array:** A hardware architecture designed to perform a series of parallel computations in a time-synchronized manner, optimizing the flow of data through a grid of processors for tasks like matrix multiplication. -The emergence of custom ASIC (Application-Specific Integrated Circuit) solutions has further diversified the hardware landscape. Companies like Graphcore, Cerebras, and SambaNova have developed unique architectures for matrix computation, each with different strengths and optimization opportunities. This proliferation of specialized hardware has pushed frameworks to adopt more flexible intermediate representations of matrix operations, allowing for target-specific optimization while maintaining a common high-level interface. +Mobile hardware accelerators, such as [Apple's Neural Engine (2017)](https://www.apple.com/newsroom/2017/09/iphone-x/) and Qualcomm's Neural Processing Units, brought new constraints and opportunities to framework design. These devices emphasized power efficiency over raw computational speed, requiring frameworks to develop new strategies for quantization and operator fusion[^fn-operation-fusion]. 
Mobile frameworks like TensorFlow Lite and PyTorch Mobile needed to balance model accuracy with energy consumption, leading to innovations in how matrix operations are scheduled and executed. -Field Programmable Gate Arrays (FPGAs) introduced yet another dimension to framework optimization. Unlike fixed-function ASICs, FPGAs allow for reconfigurable circuits that can be optimized for specific matrix operation patterns. Frameworks responding to this capability developed just-in-time compilation strategies that could generate optimized hardware configurations based on the specific needs of a model. - -### Industry Impact +[^fn-operation-fusion]: **Operation fusion:** A technique that combines multiple consecutive operations into a single kernel to reduce memory bandwidth usage and improve computational efficiency, particularly for element-wise operations. -Industry requirements fundamentally changed how machine learning frameworks evolved from research tools to production systems. As companies began deploying models at scale, they encountered challenges that academic implementations hadn't addressed. The need to serve millions of predictions per second while maintaining consistent latency led to the development of specialized serving systems. TensorFlow Serving pioneered this space by introducing model versioning, batching optimizations, and efficient resource management. PyTorch followed with TorchServe, focusing on making model deployment more intuitive for developers. +The emergence of custom ASIC (Application-Specific Integrated Circuit)[^defn-asic] solutions has further diversified the hardware landscape. Companies like Graphcore, Cerebras, and SambaNova have developed unique architectures for matrix computation, each with different strengths and optimization opportunities. This proliferation of specialized hardware has pushed frameworks to adopt more flexible intermediate representations of matrix operations, allowing for target-specific optimization while maintaining a common high-level interface. -Cloud computing introduced new possibilities and challenges for framework development. The ability to access vast computational resources demanded frameworks that could efficiently scale across hundreds or thousands of machines. This led to innovations in distributed training architectures, where frameworks had to orchestrate complex matrix operations across network-connected devices while managing communication overhead. Cloud providers developed specialized services like AWS SageMaker and Google Cloud ML Engine, which influenced how frameworks approached model training and deployment. +[^defn-asic]: **Application-Specific Integrated Circuit (ASIC):** is a custom-built hardware chip optimized for specific tasks, such as matrix computations in deep learning, offering superior performance and energy efficiency compared to general-purpose processors. -Enterprise adoption brought requirements for production-grade features that research-oriented frameworks hadn't prioritized. Organizations needed robust monitoring to track model performance and resource usage in real-time. Version control became crucial as teams collaborated on model development, leading to the integration of model registries and artifact tracking. Security considerations drove the development of encrypted computation capabilities and access controls within frameworks. - -The rise of automated machine learning (AutoML) platforms pushed frameworks to become more modular. 
These platforms needed to programmatically construct and modify models, requiring frameworks to expose standardized interfaces for model architecture manipulation. This led to the development of higher-level APIs that could abstract away the complexity of model construction while maintaining performance. +Field Programmable Gate Arrays (FPGAs) introduced yet another dimension to framework optimization. Unlike fixed-function ASICs, FPGAs allow for reconfigurable circuits that can be optimized for specific matrix operation patterns. Frameworks responding to this capability developed just-in-time compilation strategies that could generate optimized hardware configurations based on the specific needs of a model. -The increasing focus on model governance and compliance influenced framework development in unexpected ways. Organizations needed to track model lineage, explain predictions, and ensure reproducibility. Frameworks responded by adding capabilities for model documentation, experiment tracking, and deterministic computation modes. These features became as important as raw computational performance for many enterprise deployments. +## Framework Fundamentals -This evolution from pure research tools to enterprise-ready platforms marks a significant maturation in the machine learning ecosystem. Modern frameworks must balance multiple competing demands: maintaining the flexibility and ease of use that researchers require, while providing the robustness and scalability that industry demands. This balance has led to the emergence of rich framework ecosystems, where core libraries are supplemented by specialized tools for deployment, monitoring, and governance. As machine learning continues to be adopted across industries, frameworks continue to evolve, incorporating lessons learned from real-world deployments while maintaining their foundational role in advancing the field. +Modern machine learning frameworks operate through the integration of four key layers: Fundamentals, Data Handling, Developer Interface, and Execution and Abstraction. These layers function together to provide a structured and efficient foundation for model development and deployment, as illustrated in Figure @fig-fm_blocks. -## Framework Fundamentals +![Framework component interaction.](images/png/fm_building_blocks.png){#fig-fm_blocks width=80%} -The core of modern ML frameworks rests on four main aspects that work in concert to enable efficient machine learning development. @fig-fm_blocks illustrates the relationship between the four core components: computational graphs, data structures, programming models, and core operations. +The Fundamentals layer establishes the structural basis of these frameworks through computational graphs. These graphs represent the operations within a model as directed acyclic graphs (DAGs), enabling automatic differentiation and optimization. By organizing operations and data dependencies, computational graphs provide the framework with the ability to distribute workloads and execute computations efficiently across a variety of hardware platforms. -At the lowest level (i.e., the foundation level), computational graphs are the mathematical backbone, representing model operations and enabling efficient computation across hardware platforms. These graphs provide the essential structure for automatic differentiation and optimization, allowing frameworks to analyze and distribute workloads effectively. 
+The Data Handling layer manages numerical data and parameters essential for machine learning workflows. Central to this layer are specialized data structures, such as tensors, which handle high-dimensional arrays while optimizing memory usage and device placement. Additionally, memory management and data movement strategies ensure that computational workloads are executed efficiently, particularly in environments with diverse or limited hardware resources. -![Framework component interaction.](images/png/fm_building_blocks.png){#fig-fm_blocks width=80%} +The Developer Interface layer provides the tools and abstractions through which users interact with the framework. Programming models, whether imperative or symbolic, allow developers to define machine learning algorithms in a manner suited to their specific needs. Imperative models offer flexibility and ease of debugging, while symbolic models prioritize performance and deployment efficiency. Execution models further shape this interaction by defining whether computations are carried out eagerly (immediately) or as pre-optimized static graphs. -Built upon this foundation, specialized data structures form the next layer of the framework architecture. Chief among these are tensors, which act as the primary containers for numerical data and model parameters. These structures are meticulously designed to handle the high-dimensional arrays typical in machine learning while efficiently managing memory and device placement. +The Execution and Abstraction layer transforms these high-level representations into efficient hardware-executable operations. Core operations, encompassing everything from basic linear algebra to complex neural network layers, are highly optimized for diverse hardware platforms. This layer also includes mechanisms for allocating resources and managing memory dynamically, ensuring robust and scalable performance in both training and inference settings. -The programming model of a framework defines how developers interact with these underlying structures and operations. Modern frameworks offer a range of approaches, from imperative programming paradigms (flexible and interactive) to symbolic paradigms (optimized for deployment). Each approach provides distinct advantages in terms of development experience, debugging capabilities, and execution efficiency. The choice of programming model greatly influences how algorithms are expressed and how frameworks optimize their execution. +Understanding these interconnected layers is essential for leveraging machine learning frameworks effectively. Each layer plays a distinct yet interdependent role in facilitating experimentation, optimization, and deployment. By mastering these concepts, practitioners can make informed decisions about resource utilization, scaling strategies, and the suitability of specific frameworks for various tasks. -At the highest level of abstraction, frameworks implement core operations that handle essential mathematical and machine learning computations. These operations, ranging from basic linear algebra to complex neural network layers, are optimized for performance across various hardware platforms. They serve as the building blocks for constructing sophisticated machine learning models. -When selecting a downstream framework for deployment, a solid grasp of these concepts is essential. This knowledge enables informed decisions about resource utilization, scaling strategies, and deployment architectures. 
In production systems, understanding these aspects ensures efficient performance and scalability. ### Computational Graphs -The computational graph is the basic abstraction in modern machine learning frameworks. It bridges the gap between high-level model descriptions and low-level hardware execution. +The computational graph is the basic abstraction in modern machine learning frameworks. It bridges the gap between high-level model descriptions and low-level hardware execution [@Baydin_2018]. -A computational graph represents a machine learning model as a directed acyclic graph (DAG) where nodes represent operations and edges represent data flow. This representation enables frameworks to perform system-level optimizations, manage hardware resources, and efficiently execute complex mathematical operations across diverse computing platforms. +A computational graph represents a machine learning model as a directed acyclic graph (DAG) where nodes represent operations and edges represent data flow. This representation enables frameworks to perform system-level optimizations, manage hardware resources, and efficiently execute complex mathematical operations across diverse computing platforms [@Abadi_2016]. As shown in @fig-mlfm-static-graph, the structure of the computation graph involves defining a sequence of interconnected layers, such as convolution, activation, pooling, and normalization, which are optimized before execution. The figure also highlights system-level interactions, including memory management and device placement, showcasing how the static graph approach enables comprehensive pre-execution analysis and resource allocation. @@ -163,7 +149,9 @@ This upfront definition enables powerful system-level optimizations. The framewo #### Dynamic Graphs -Dynamic computation graphs, popularized by PyTorch, implement a "define-by-run" execution model. This approach constructs the graph during execution, offering greater flexibility in model definition and debugging. From a systems perspective, this architectural choice presents a different set of tradeoffs. The dynamic approach enables immediate execution of operations and allows graph structure to depend on runtime conditions. +Dynamic computation graphs, popularized by PyTorch, implement a "define-by-run" execution model. This approach constructs the graph during execution, offering greater flexibility in model definition and debugging. Unlike static graphs, which rely on predefined memory allocation, dynamic graphs allocate memory as operations execute, making them susceptible to memory fragmentation[^defn-memory-fragmentation] in long-running tasks. + +[^defn-memory-fragmentation]: **Memory Fragmentation:** The inefficient use of memory caused by small, unused gaps between allocated memory blocks, often resulting in wasted memory or reduced performance. As shown in @fig-mlfm-dynamic-graph-flow, each operation is defined, executed, and completed before moving on to define the next operation. This contrasts sharply with static graphs, where all operations must be defined upfront. When an operation is defined, it is immediately executed, and its results become available for subsequent operations or for inspection during debugging. This cycle continues until all operations are complete. 
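+
+To make this define-by-run behavior concrete, the following minimal PyTorch sketch (illustrative only; the shapes and the loop bound are arbitrary) builds a graph whose depth depends on a runtime value, and gradients then flow through whichever operations were actually executed:
+
+```python
+import torch
+
+# Define-by-run: the graph is assembled as operations execute.
+x = torch.randn(4, 4, requires_grad=True)
+h = x
+
+# The number of matrix multiplications is decided at runtime.
+steps = 0
+while h.norm() < 100 and steps < 10:
+    h = h @ x
+    steps += 1
+
+loss = h.sum()
+loss.backward()  # gradients flow through whichever graph was actually built
+print(steps, x.grad.shape)
+```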
@@ -218,3 +206,648 @@ These hybrid solutions aim to provide the flexibility of dynamic graphs during d
+-----------------------------------+------------------------------------------------------+------------------------------------------------------------+

: Comparison of static and dynamic computational graphs. {#tbl-mlfm-graphs .hover .striped}
+
+### Data Structures
+
+Machine learning frameworks build upon computational graphs through specialized data structures that bridge high-level computations with practical implementation. These data structures serve two essential purposes: they provide containers for the numerical data that powers machine learning models, and they manage how this data is stored and moved across different memory spaces and devices.
+
+While computational graphs specify the logical flow of operations, data structures determine how these operations actually access and manipulate data in memory. This dual role of organizing numerical data for model computations while handling the complexities of memory management and device placement shapes how frameworks translate mathematical operations into efficient executions across diverse computing platforms.
+
+The effectiveness of machine learning frameworks depends heavily on their underlying data organization. While machine learning theory can be expressed through mathematical equations, turning these equations into practical implementations demands thoughtful consideration of data organization, storage, and manipulation. Modern machine learning models must process enormous amounts of data during training and inference, making efficient data access and memory usage critical across diverse hardware platforms.
+
+A framework's data structures must excel in three key areas. First, they need to deliver high performance, supporting rapid data access and efficient memory use across different hardware. This includes optimizing memory layouts for cache efficiency and enabling smooth data transfer between memory hierarchies and devices. Second, they must offer flexibility, accommodating various model architectures and training approaches while supporting different data types and precision requirements. Third, they should provide clear and intuitive interfaces to developers while handling complex memory management and device placement behind the scenes.
+
+These data structures serve as a bridge between mathematical concepts and practical computing systems. The core operations in machine learning—matrix multiplication, convolution, activation functions—set basic requirements for how data must be organized. These structures must maintain numerical precision and stability while enabling efficient implementation of common operations and automatic gradient computation. However, they must also work within real-world computing constraints, dealing with limited memory bandwidth, varying hardware capabilities, and the needs of distributed computing.
+
+The design choices made in implementing these data structures significantly influence what machine learning frameworks can achieve. Poor decisions in data structure design can result in excessive memory use, limiting model size and batch capabilities. They might create performance bottlenecks that slow down training and inference, or produce interfaces that make programming error-prone.
On the other hand, thoughtful design enables automatic optimization of memory usage and computation, efficient scaling across hardware configurations, and intuitive programming interfaces that support rapid implementation of new techniques.
+
+As we explore specific data structures in the following sections, we'll examine how frameworks address these challenges through careful design decisions and optimization approaches. This understanding proves essential for anyone working with machine learning systems, whether developing new models, optimizing existing ones, or creating new framework capabilities. We begin with tensor abstractions, the fundamental building blocks of modern machine learning frameworks, before exploring more specialized structures for parameter management, dataset handling, and execution control.
+
+#### Tensor Fundamentals
+
+Machine learning frameworks process and store numerical data as tensors. Every computation in a neural network, from processing input data to updating model weights, operates on tensors. Training batches of images, activation maps in convolutional networks, and parameter gradients during backpropagation all take the form of tensors. This unified representation allows frameworks to implement consistent interfaces for data manipulation and optimize operations across different hardware architectures.
+
+##### Structure and Dimensionality
+
+A tensor is a mathematical object that generalizes scalars, vectors, and matrices to higher dimensions. In its simplest form, a scalar is a zero-dimensional tensor containing a single value, a vector is a one-dimensional tensor containing a sequence of values, and a matrix is a two-dimensional tensor containing values arranged in rows and columns. Higher-dimensional tensors extend this pattern - a three-dimensional tensor can be visualized as a stack of matrices, while tensors of even higher dimensions follow similar patterns of nested structure. For example, a grayscale image is represented as a 2D matrix of pixel values. A color image adds RGB channels, forming a 3D tensor. Batch processing adds a fourth dimension, resulting in a 4D tensor.
+
+In machine learning frameworks, tensors take on additional properties beyond their mathematical definition to meet the demands of modern ML systems. While mathematical tensors provide a foundation as multi-dimensional arrays with transformation properties, machine learning introduces requirements for practical computation. These requirements shape how frameworks balance mathematical precision with computational performance.
+
+Framework tensors combine numerical data arrays with computational metadata. The dimensional structure, or shape, ranges from simple vectors and matrices to higher-dimensional arrays that represent complex data like image batches or sequence models. This dimensional information plays a critical role in operation validation and optimization. Matrix multiplication operations, for example, depend on shape metadata to verify dimensional compatibility and determine optimal computation paths.
+
+Memory layout implementation introduces distinct challenges in tensor design. While tensors provide an abstraction of multi-dimensional data, physical computer memory remains linear. Stride patterns address this disparity by creating mappings between multi-dimensional tensor indices and linear memory addresses. These patterns significantly impact computational performance by determining memory access patterns during tensor operations.
Careful alignment of stride patterns with hardware memory hierarchies maximizes cache efficiency and memory throughput. + +##### Type Systems and Precision + +Tensor implementations use type systems to control numerical precision and memory consumption. The standard choice in machine learning has been 32-bit floating-point numbers (```float32```), offering a balance of precision and efficiency. Modern frameworks extend this with multiple numeric types for different needs. Integer types support indexing and embedding operations. Reduced-precision types like 16-bit floating-point numbers enable efficient mobile deployment. 8-bit integers allow fast inference on specialized hardware. + +The choice of numeric type affects both model behavior and computational efficiency. Neural network training typically requires float32 precision to maintain stable gradient computations. Inference tasks can often use lower precision (```int8``` or even ```int4```), reducing memory usage and increasing processing speed. Mixed-precision training[^defn-mixed-precision] approaches combine these benefits by using float32 for critical accumulations while performing most computations at lower precision. + +[^defn-mixed-precision]: **Mixed-precision training:** A training approach that uses lower-precision arithmetic for most calculations while retaining higher-precision for critical operations, balancing performance and numerical stability. + +Type conversions between different numeric representations require careful management. Operating on tensors with different types demands explicit conversion rules to preserve numerical correctness. These conversions introduce computational costs and risk precision loss. Frameworks provide type casting capabilities but rely on developers to maintain numerical precision across operations. + +##### Device Placement and Memory Management + +The rise of heterogeneous computing has transformed how machine learning frameworks manage tensor operations. Modern frameworks must seamlessly operate across CPUs, GPUs, TPUs, and various other accelerators, each offering different computational advantages and memory characteristics. This diversity creates a fundamental challenge: tensors must move efficiently between devices while maintaining computational coherency throughout the execution of machine learning workloads. + +Device placement decisions significantly influence both computational performance and memory utilization. Moving tensors between devices introduces latency costs and consumes precious bandwidth on system interconnects. Keeping multiple copies of tensors across different devices can accelerate computation by reducing data movement, but this strategy increases overall memory consumption and requires careful management of consistency between copies. Frameworks must therefore implement sophisticated memory management systems that track tensor locations and orchestrate data movement while considering these tradeoffs. + +These memory management systems maintain a dynamic view of available device memory and implement strategies for efficient data transfer. When operations require tensors that reside on different devices, the framework must either move data or redistribute computation. This decision process integrates deeply with the framework's computational graph execution and operation scheduling. Memory pressure on individual devices, data transfer costs, and computational load all factor into placement decisions. 
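+
+As a brief illustration of these ideas, the PyTorch sketch below (illustrative only; the shapes and device string are arbitrary) shows the metadata a framework tracks alongside the raw values and how precision and placement are controlled explicitly:
+
+```python
+import torch
+
+# A batch of 32 RGB images: shape, element type, and memory strides
+x = torch.randn(32, 3, 224, 224)
+print(x.shape, x.dtype, x.stride())
+
+# Precision control: cast to 16-bit floats for cheaper inference-style math
+x_half = x.to(torch.float16)
+
+# Device placement: move the tensor to an accelerator when one is available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+x_dev = x_half.to(device)
+print(x_dev.dtype, x_dev.device)
+```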
+ +The interplay between device placement and memory management extends beyond simple data movement. Frameworks must anticipate future computational needs to prefetch data efficiently, manage memory fragmentation across devices, and handle cases where memory demands exceed device capabilities. This requires close coordination between the memory management system and the operation scheduler, especially in scenarios involving parallel computation across multiple devices or distributed training across machine boundaries. + +#### Specialized Structures + +While tensors are the building blocks of machine learning frameworks, they are not the only structures required for effective system operation. Frameworks rely on a suite of specialized data structures tailored to address the distinct needs of data processing, model parameter management, and execution coordination. These structures ensure that the entire workflow—from raw data ingestion to optimized execution on hardware—proceeds seamlessly and efficiently. + +##### Dataset Structures + +Dataset structures handle the critical task of transforming raw input data into a format suitable for machine learning computations. These structures bridge the gap between diverse data sources and the tensor abstractions required by models, automating the process of reading, parsing, and preprocessing data. + +Dataset structures must support efficient memory usage while dealing with input data far larger than what can fit into memory at once. For example, when training on large image datasets, these structures load images from disk, decode them into tensor-compatible formats, and apply transformations like normalization or augmentation in real time. Frameworks implement mechanisms such as data streaming, caching, and shuffling to ensure a steady supply of preprocessed batches without bottlenecks. + +The design of dataset structures directly impacts training performance. Poorly designed structures can create significant overhead, limiting data throughput to GPUs or other accelerators. In contrast, well-optimized dataset handling can leverage parallelism across CPU cores, disk I/O, and memory transfers to feed accelerators at full capacity. + +In large, multi-system distributed training scenarios, dataset structures also handle coordination between nodes, ensuring that each worker processes a distinct subset of data while maintaining consistency in operations like shuffling. This coordination prevents redundant computation and supports scalability across multiple devices and machines. + +##### Parameter Structures + +Parameter structures store the numerical values that define a machine learning model. These include the weights and biases of neural network layers, along with auxiliary data such as batch normalization statistics and optimizer state. Unlike datasets, which are transient, parameters persist throughout the lifecycle of model training and inference. + +The design of parameter structures must balance efficient storage with rapid access during computation. For example, convolutional neural networks require parameters for filters, fully connected layers, and normalization layers, each with unique shapes and memory alignment requirements. Frameworks organize these parameters into compact representations that minimize memory consumption while enabling fast read and write operations. + +A key challenge for parameter structures is managing memory efficiently across multiple devices [@li2014communication]. 
During distributed training, frameworks may replicate parameters across GPUs for parallel computation while keeping a synchronized master copy on the CPU. This strategy ensures consistency while reducing the latency of gradient updates. Additionally, parameter structures often leverage memory sharing techniques to minimize duplication, such as storing gradients and optimizer states in place to conserve memory. + +Parameter structures must also adapt to various precision requirements. While training typically uses 32-bit floating-point precision for stability, reduced precision such as 16-bit floating-point or even 8-bit integers is increasingly used for inference and large-scale training. Frameworks implement type casting and mixed-precision management to enable these optimizations without compromising numerical accuracy. + +##### Execution Structures + +Execution structures coordinate how computations are performed on hardware, ensuring that operations execute efficiently while respecting device constraints. These structures work closely with computational graphs, determining how data flows through the system and how memory is allocated for intermediate results. + +One of the primary roles of execution structures is memory management. During training or inference, intermediate computations such as activation maps or gradients can consume significant memory. Execution structures dynamically allocate and deallocate memory buffers to avoid fragmentation and maximize hardware utilization. For example, a deep neural network might reuse memory allocated for activation maps across layers, reducing the overall memory footprint. + +These structures also handle operation scheduling, ensuring that computations are performed in the correct order and with optimal hardware utilization. On GPUs, for instance, execution structures can overlap computation and data transfer operations, hiding latency and improving throughput. When running on multiple devices, they synchronize dependent computations to maintain consistency without unnecessary delays. + +Distributed training introduces additional complexity, as execution structures must manage data and computation across multiple nodes. This includes partitioning computational graphs, synchronizing gradients, and redistributing data as needed. Efficient execution structures minimize communication overhead, allowing distributed systems to scale linearly with additional hardware [@mcmahan2023communicationefficient]. + +### Programming Models + +Programming models define how developers express computations in code. In previous sections, we explored computational graphs and specialized data structures, which together define the computational processes of machine learning frameworks. Computational graphs outline the sequence of operations, such as matrix multiplication or convolution, while data structures like tensors store the numerical values that these operations manipulate. Broadly, these models fall into two categories: symbolic programming and imperative programming. + +#### Symbolic Programming + +Symbolic programming involves constructing abstract representations of computations first and executing them later. This approach aligns naturally with static computational graphs, where the entire structure is defined before any computation occurs. + +For instance, in symbolic programming, variables and operations are represented as symbols. 
These symbolic expressions are not evaluated until explicitly executed, allowing the framework to analyze and optimize the computation graph before running it.
+
+Consider the following symbolic programming example, written in the TensorFlow 1.x graph style:
+
+```python
+import numpy as np
+import tensorflow.compat.v1 as tf
+tf.disable_eager_execution()
+
+# Expressions are constructed but not evaluated
+weights = tf.Variable(tf.random.normal([784, 10]))
+input = tf.placeholder(tf.float32, [None, 784])
+output = tf.matmul(input, weights)
+
+# Separate evaluation phase
+data = np.random.rand(32, 784).astype(np.float32)
+with tf.Session() as sess:
+    sess.run(tf.global_variables_initializer())
+    result = sess.run(output, feed_dict={input: data})
+```
+
+This approach enables frameworks to apply global optimizations across the entire computation, making it efficient for deployment scenarios. Additionally, static graphs can be serialized and executed across different environments, enhancing portability. Predefined graphs also facilitate efficient parallel execution strategies. However, debugging can be challenging because errors often surface during execution rather than graph construction, and modifying a static graph dynamically is cumbersome.
+
+#### Imperative Programming
+
+Imperative programming takes a more traditional approach, executing operations immediately as they are encountered. This method corresponds to dynamic computational graphs, where the structure evolves dynamically during execution.
+
+In this programming paradigm, computations are performed directly as the code executes, closely resembling the procedural style of most general-purpose programming languages. For example:
+
+```python
+# Imperative Programming Example
+# Each expression evaluates immediately
+import torch
+
+weights = torch.randn(784, 10)
+input = torch.randn(32, 784)
+output = input @ weights  # Computation occurs now
+```
+
+The immediate execution model is intuitive and aligns with common programming practices, making it easier to use. Errors can be detected and resolved immediately during execution, simplifying debugging. Dynamic graphs allow for adjustments on-the-fly, making them ideal for tasks requiring variable graph structures, such as reinforcement learning or sequence modeling. However, the creation of dynamic graphs at runtime can introduce computational overhead, and the framework’s ability to optimize the entire computation graph is limited due to the step-by-step execution process.
+
+#### System Implementation Considerations
+
+The choice between symbolic and imperative programming models fundamentally influences how ML frameworks manage system-level features such as memory management and optimization strategies.
+
+In symbolic programming, frameworks can analyze the entire computation graph upfront. This allows for efficient memory allocation strategies. For example, memory can be reused for intermediate results that are no longer needed during later stages of computation. This global view also enables advanced optimization techniques such as operation fusion, automatic differentiation, and hardware-specific kernel selection. These optimizations make symbolic programming highly effective for production environments where performance is critical.
+
+In contrast, imperative programming makes memory management and optimization more challenging since decisions must be made at runtime. Each operation executes immediately, which prevents the framework from globally analyzing the computation. This trade-off, however, provides developers with greater flexibility and immediate feedback during development.
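+
+One way to see this difference is to capture an imperatively written function as a graph. The hedged sketch below uses PyTorch's TorchScript (one possible mechanism; TensorFlow's `tf.function` plays a similar role) to give the framework the whole-computation view described above; the function and shapes are illustrative:
+
+```python
+import torch
+
+def affine_relu(x, w, b):
+    # Written imperatively; each call normally executes eagerly.
+    return torch.relu(x @ w + b)
+
+# Scripting captures the function as a graph the framework can analyze,
+# for example to fuse element-wise operations.
+scripted = torch.jit.script(affine_relu)
+print(scripted.graph)  # inspect the captured intermediate representation
+
+x, w, b = torch.randn(8, 16), torch.randn(16, 4), torch.randn(4)
+print(scripted(x, w, b).shape)
+```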
+
+##### Development and Debugging
+
+Symbolic programming requires developers to conceptualize their models as complete computational graphs. This often involves extra steps to inspect intermediate values, as symbolic execution defers computation until explicitly invoked. For example, in TensorFlow 1.x, developers need to use sessions and feed dictionaries to debug intermediate results, which can slow down the development process.
+
+Imperative programming offers a more straightforward debugging experience. Operations execute immediately, allowing developers to inspect tensor values and shapes as the code runs. This immediate feedback simplifies experimentation and makes it easier to identify and fix issues in the model. As a result, imperative programming is well-suited for rapid prototyping and iterative model development.
+
+##### Navigating Trade-offs in Practice
+
+The choice between symbolic and imperative programming models often depends on the specific needs of a project. Symbolic programming excels in scenarios where performance and optimization are critical, such as production deployments. In contrast, imperative programming provides the flexibility and ease of use necessary for research and development.
+
+Modern frameworks have introduced hybrid approaches that combine the strengths of both paradigms. For instance, TensorFlow 2.x allows developers to write code in an imperative style while converting computations into optimized graph representations for deployment. Similarly, PyTorch provides tools like TorchScript to convert dynamic models into static graphs for production use. These hybrid approaches help bridge the gap between the flexibility of imperative programming and the efficiency of symbolic programming, enabling developers to navigate the trade-offs effectively.
+
+### Execution Models
+
+Machine learning frameworks employ various execution paradigms to determine how computations are performed. These paradigms significantly influence the development experience, performance characteristics, and deployment options of ML systems. Let's explore three key execution paradigms: eager execution, graph execution, and just-in-time compilation.
+
+#### Eager Execution
+
+Eager execution is the most straightforward and intuitive execution paradigm. In this model, operations are executed immediately as they are called in the code. This approach closely mirrors the way traditional imperative programming languages work, making it familiar to many developers.
+
+Consider the following example using TensorFlow 2.x, which employs eager execution by default:
+
+```python
+import tensorflow as tf
+
+x = tf.constant([[1., 2.], [3., 4.]])
+y = tf.constant([[1., 2.], [3., 4.]])
+z = tf.matmul(x, y)
+print(z)
+```
+
+In this code snippet, each line is executed sequentially. When we create the tensors `x` and `y`, they are immediately instantiated in memory. The matrix multiplication `tf.matmul(x, y)` is computed right away, and the result is stored in `z`. When we print `z`, we see the output of the computation immediately.
+
+Eager execution offers several advantages. It provides immediate feedback, allowing developers to inspect intermediate values easily. This makes debugging more straightforward and intuitive. It also allows for more dynamic and flexible code structures, as the computation graph can change with each execution.
+
+However, eager execution has its trade-offs.
Since operations are executed immediately, the framework has less opportunity to optimize the overall computation graph. This can lead to lower performance compared to more optimized execution paradigms, especially for complex models or when dealing with large datasets. + +Eager execution is particularly well-suited for research, interactive development, and rapid prototyping. It allows data scientists and researchers to quickly iterate on their ideas and see results immediately. Many modern ML frameworks, including TensorFlow 2.x and PyTorch, use eager execution as their default mode due to its developer-friendly nature. + +#### Graph Execution + +Graph execution, also known as static graph execution, takes a different approach to computing operations in ML frameworks. In this paradigm, developers first define the entire computational graph, and then execute it as a separate step. + +Consider the following example using TensorFlow 1.x style, which employs graph execution: + +```python +import tensorflow.compat.v1 as tf +tf.disable_eager_execution() + +# Define the graph +x = tf.placeholder(tf.float32, shape=(2, 2)) +y = tf.placeholder(tf.float32, shape=(2, 2)) +z = tf.matmul(x, y) + +# Execute the graph +with tf.Session() as sess: + result = sess.run(z, feed_dict={ + x: [[1., 2.], [3., 4.]], + y: [[1, 2], [3, 4]] + }) + print(result) +``` + +In this code snippet, we first define the structure of our computation. The `placeholder` operations create nodes in the graph for input data, while `tf.matmul` creates a node representing matrix multiplication. Importantly, no actual computation occurs during this definition phase. + +The execution of the graph happens when we create a session and call `sess.run()`. At this point, we provide the actual input data through the `feed_dict` parameter. The framework then has the complete graph and can perform optimizations before running the computation. + +Graph execution offers several advantages. It allows the framework to see the entire computation ahead of time, enabling global optimizations that can improve performance, especially for complex models. Once defined, the graph can be easily saved and deployed across different environments, enhancing portability. It's particularly efficient for scenarios where the same computation is repeated many times with different data inputs. + +However, graph execution also has its trade-offs. It requires developers to think in terms of building a graph rather than writing sequential operations, which can be less intuitive. Debugging can be more challenging because errors often don't appear until the graph is executed. Additionally, implementing dynamic computations can be more difficult with a static graph. + +Graph execution is well-suited for production environments where performance and deployment consistency are crucial. It's commonly used in scenarios involving large-scale distributed training or when deploying models to serve predictions in high-throughput applications. + +#### Just-In-Time Compilation + +Just-In-Time (JIT) compilation represents a middle ground between eager execution and graph execution. This paradigm aims to combine the flexibility of eager execution with the performance benefits of graph optimization. 
+ +Let's examine an example using PyTorch's JIT compilation: + +```python +import torch + +@torch.jit.script +def compute(x, y): + return torch.matmul(x, y) + +x = torch.randn(2, 2) +y = torch.randn(2, 2) + +# First call compiles the function +result = compute(x, y) +print(result) + +# Subsequent calls use the optimized version +result = compute(x, y) +print(result) +``` + +In this code snippet, we define a function `compute` and decorate it with `@torch.jit.script`. This decorator tells PyTorch to compile the function using its JIT compiler. The first time `compute` is called, PyTorch analyzes the function, optimizes it, and generates efficient machine code. This compilation process occurs just before the function is executed, hence the term "Just-In-Time". + +Subsequent calls to `compute` use the optimized version, potentially offering significant performance improvements, especially for complex operations or when called repeatedly. + +JIT compilation provides a balance between development flexibility and runtime performance. It allows developers to write code in a natural, eager-style manner while still benefiting from many of the optimizations typically associated with graph execution. + +This approach offers several advantages. It maintains the immediate feedback and intuitive debugging of eager execution, as most of the code still executes eagerly. At the same time, it can deliver performance improvements for critical parts of the computation. JIT compilation can also adapt to the specific data types and shapes being used, potentially resulting in more efficient code than static graph compilation. + +However, JIT compilation also has some considerations. The first execution of a compiled function may be slower due to the overhead of the compilation process. Additionally, some complex Python constructs may not be easily JIT-compiled, requiring developers to be aware of what can be optimized effectively. + +JIT compilation is particularly useful in scenarios where you need both the flexibility of eager execution for development and prototyping, and the performance benefits of compilation for production or large-scale training. It's commonly used in research settings where rapid iteration is necessary but performance is still a concern. + +Many modern ML frameworks incorporate JIT compilation to provide developers with a balance of ease-of-use and performance optimization, as shown in @tbl-mlfm-execmodes. This balance manifests across multiple dimensions - from the learning curve that gradually introduces optimization concepts, to the runtime behavior that combines immediate feedback with performance enhancements. The table highlights how JIT compilation bridges the gap between eager execution's programming simplicity and graph execution's performance benefits, particularly in areas like memory usage and optimization scope. 
+ ++-----------------+---------------------------------+--------------------------------+--------------------------------+ +| Aspect | Eager Execution | Graph Execution | JIT Compilation | ++:================+:================================+:===============================+:===============================+ +| Approach | Computes each operation | Builds entire computation | Analyzes code at runtime, | +| | immediately when encountered | plan first, then executes | creates optimized version | ++-----------------+---------------------------------+--------------------------------+--------------------------------+ +| Memory Usage | Holds intermediate results | Optimizes memory by planning | Adapts memory usage based | +| | throughout computation | complete data flow | on actual execution patterns | ++-----------------+---------------------------------+--------------------------------+--------------------------------+ +| Optimization | Limited to local | Global optimization across | Combines runtime analysis | +| Scope | operation patterns | entire computation chain | with targeted optimizations | ++------------------+--------------------------------+--------------------------------+--------------------------------+ +| Debugging | Examine values at any point | Must set up specific | Initial runs show original | +| Approach | during computation | monitoring points in graph | behavior, then optimizes | ++------------------+--------------------------------+--------------------------------+--------------------------------+ +| Speed vs | Prioritizes flexibility | Prioritizes performance | Balances flexibility and | +| Flexibility | over speed | over flexibility | performance | ++-----------------+---------------------------------+--------------------------------+--------------------------------+ + +: Comparison of execution models in machine learning frameworks. {#tbl-mlfm-execmodes} + +### Core Operations + +Machine learning frameworks provide multiple layers of operations that transform high-level model descriptions into efficient computations on hardware. These operations form a hierarchy: hardware abstraction operations manage the complexity of diverse computing platforms, basic numerical operations implement fundamental mathematical computations, and system-level operations coordinate resources and execution. Understanding this operational hierarchy is essential for comprehending how frameworks translate mathematical models into practical implementations. + +#### Hardware Abstraction Operations + +At the lowest level, hardware abstraction operations provide the foundation for executing computations across diverse computing platforms. These operations isolate higher layers from hardware-specific details while maintaining computational efficiency. The abstraction layer must handle three fundamental aspects: compute kernel management, memory system abstraction, and execution control. + +Compute kernel management involves selecting and dispatching optimal implementations of mathematical operations for different hardware architectures. This requires maintaining multiple implementations of core operations and sophisticated dispatch logic. For example, a matrix multiplication operation might be implemented using AVX-512 vector instructions on modern CPUs, cuBLAS on NVIDIA GPUs, or specialized tensor processing instructions on AI accelerators. The kernel manager must consider input sizes, data layout, and hardware capabilities when selecting implementations. 
It must also handle fallback paths for when specialized implementations are unavailable or unsuitable. + +Memory system abstractions manage data movement through complex memory hierarchies. These abstractions must handle various memory types (registered, pinned, unified) and their specific access patterns. Data layouts often require transformation between hardware-preferred formats - for instance, between row-major and column-major matrix layouts, or between interleaved and planar image formats. The memory system must also manage alignment requirements, which can vary from 4-byte alignment on CPUs to 128-byte alignment on some accelerators. Additionally, it handles cache coherency issues when multiple execution units access the same data. + +Execution control operations coordinate computation across multiple execution units and memory spaces. This includes managing execution queues, handling event dependencies, and controlling asynchronous operations. Modern hardware often supports multiple execution streams that can operate concurrently. For example, independent GPU streams or CPU thread pools. The execution controller must manage these streams, handle synchronization points, and ensure correct ordering of dependent operations. It must also provide error handling and recovery mechanisms for hardware-specific failures. + +#### Basic Numerical Operations + +Building upon hardware abstractions, frameworks implement fundamental numerical operations that form the building blocks of machine learning computations. These operations must balance mathematical precision with computational efficiency. At their core are General Matrix Multiply (GEMM) operations, which dominate the computational cost of most machine learning workloads. GEMM operations follow the pattern C = αAB + βC, where A, B, and C are matrices, and α and β are scaling factors. + +The implementation of GEMM operations requires sophisticated optimization techniques. These include blocking for cache efficiency, where matrices are divided into smaller tiles that fit in cache memory; loop unrolling to increase instruction-level parallelism; and specialized implementations for different matrix shapes and sparsity patterns. For example, fully-connected neural network layers typically use regular dense GEMM operations, while convolutional layers often employ specialized GEMM variants that exploit input locality patterns. + +Beyond GEMM, frameworks must efficiently implement BLAS operations such as vector addition (AXPY), matrix-vector multiplication (GEMV), and various reduction operations. These operations require different optimization strategies - AXPY operations are typically memory-bandwidth limited, while GEMV operations must balance memory access patterns with computational efficiency. + +Element-wise operations form another critical category, including both basic arithmetic operations (addition, multiplication) and transcendental functions (exponential, logarithm, trigonometric functions). While conceptually simpler than GEMM, these operations present significant optimization opportunities through vectorization and operation fusion. For example, multiple element-wise operations can often be fused into a single kernel to reduce memory bandwidth requirements. The efficiency of these operations becomes particularly important in neural network activation functions and normalization layers, where they process large volumes of data. + +Modern frameworks must also handle operations with varying numerical precision requirements. 
For example, training often requires 32-bit floating-point precision for numerical stability, while inference can often use reduced precision formats like 16-bit floating-point or even 8-bit integers. Frameworks must therefore provide efficient implementations across multiple numerical formats while maintaining acceptable accuracy. + +#### System-Level Operations + +System-level operations build upon the previously discussed computational graph abstractions, hardware abstractions, and numerical operations to manage overall computation flow and resource utilization. These operations handle three critical aspects: operation scheduling, memory management, and resource optimization. + +Operation scheduling leverages the computational graph structure discussed earlier to determine execution ordering. Building on the static or dynamic graph representation, the scheduler must identify parallelization opportunities while respecting dependencies. The implementation challenges differ between static graphs, where the entire dependency structure is known in advance, and dynamic graphs, where dependencies emerge during execution. The scheduler must also handle advanced execution patterns like conditional operations and loops that create dynamic control flow within the graph structure. + +Memory management implements sophisticated strategies for allocating and deallocating memory resources across the computational graph. Different data types require different management strategies. Model parameters typically persist throughout execution and may require specific memory types for efficient access. Intermediate results have bounded lifetimes defined by the operation graph - for example, activation values needed only during the backward pass. The memory manager employs techniques like reference counting for automatic cleanup, memory pooling to reduce allocation overhead, and workspace management for temporary buffers. It must also handle memory fragmentation, particularly in long-running training sessions where allocation patterns can change over time. + +Resource optimization integrates scheduling and memory decisions to maximize performance within system constraints. A key optimization is gradient checkpointing[^defn-gradient-checkpointing], where some intermediate results are discarded and recomputed rather than stored, trading computation time for memory savings. The optimizer must also manage concurrent execution streams, balancing load across available compute units while respecting dependencies. For operations with multiple possible implementations, it selects between alternatives based on runtime conditions - for instance, choosing between matrix multiplication algorithms based on matrix shapes and system load. + +[^defn-gradient-checkpointing]: **Gradient checkpointing:** A memory-saving optimization technique that stores a limited set of intermediate activations during the forward pass and recomputes the others during the backward pass to reduce memory usage. + +Together, these operational layers build upon the computational graph foundation to execute machine learning workloads efficiently while abstracting implementation complexity from model developers. The interaction between these layers determines overall system performance and sets the foundation for advanced optimization techniques discussed in subsequent chapters. 
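+
+To ground one of these resource optimizations, the sketch below shows gradient checkpointing with PyTorch's `torch.utils.checkpoint` utility (a minimal, illustrative configuration; the layer sizes are arbitrary, and the `use_reentrant=False` argument assumes a recent PyTorch release). Activations inside the checkpointed block are recomputed during the backward pass instead of being stored:
+
+```python
+import torch
+from torch.utils.checkpoint import checkpoint
+
+block = torch.nn.Sequential(
+    torch.nn.Linear(1024, 1024),
+    torch.nn.ReLU(),
+    torch.nn.Linear(1024, 1024),
+)
+
+x = torch.randn(64, 1024, requires_grad=True)
+
+# Forward pass without storing the block's intermediate activations
+y = checkpoint(block, x, use_reentrant=False)
+
+loss = y.sum()
+loss.backward()  # the block is re-executed here to recover the activations
+```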
+
+## Framework Components
+
+Machine learning frameworks organize their fundamental capabilities into distinct components that work together to provide a complete development and deployment environment. These components create layers of abstraction that make frameworks both usable for high-level model development and efficient for low-level execution. Understanding how these components interact helps developers choose and use frameworks effectively.
+
+### APIs and Abstractions
+
+The API layer of machine learning frameworks provides the primary interface through which developers interact with the framework's capabilities. This layer must balance multiple competing demands: it must be intuitive enough for rapid development, flexible enough to support diverse use cases, and efficient enough to enable high-performance implementations.
+
+Modern framework APIs typically implement multiple levels of abstraction. At the lowest level, they provide direct access to tensor operations and computational graph construction. These low-level APIs expose the fundamental operations discussed in the previous section, allowing fine-grained control over computation. For example, frameworks like PyTorch and TensorFlow offer such low-level interfaces, enabling researchers to define custom computations and explore novel algorithms [@paszke2019pytorch; @Abadi_2016].
+
+```python
+# Low-level API example
+import torch
+
+# Manual tensor operations; parameters are marked as requiring gradients
+x = torch.randn(2, 3)
+w = torch.randn(3, 4, requires_grad=True)
+b = torch.randn(4, requires_grad=True)
+y = torch.matmul(x, w) + b
+
+# Manual gradient computation (gradients accumulate in w.grad and b.grad)
+y.backward(torch.ones_like(y))
+```
+
+Building on these primitives, frameworks implement higher-level APIs that package common patterns into reusable components. Neural network layers represent a classic example---while a convolution operation could be implemented manually using basic tensor operations, frameworks provide pre-built layer abstractions that handle the implementation details. This approach is exemplified by libraries such as PyTorch's `torch.nn` and TensorFlow's Keras API, which enable efficient and user-friendly model development [@chollet2018keras].
+
+```{.python}
+# Mid-level API example using nn modules
+import torch
+import torch.nn as nn
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Conv2d(3, 64, kernel_size=3)
+        self.fc = nn.Linear(64, 10)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = torch.relu(x)
+        x = x.mean(dim=(2, 3))  # global average pooling before the linear layer
+        x = self.fc(x)
+        return x
+```
+
+At the highest level, frameworks often provide model-level abstractions that automate common workflows. For example, the Keras API provides a highly abstract interface that hides most implementation details:
+
+```{.python}
+# High-level API example using Keras
+from tensorflow import keras
+
+model = keras.Sequential([
+    keras.layers.Conv2D(64, 3, activation='relu', input_shape=(32, 32, 3)),
+    keras.layers.Flatten(),
+    keras.layers.Dense(10, activation='softmax')
+])
+
+# Automated training workflow
+model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
+model.fit(train_data, train_labels, epochs=10)
+```
+
+The organization of these API layers reflects fundamental trade-offs in framework design. Lower-level APIs provide maximum flexibility but require more expertise to use effectively. Higher-level APIs improve developer productivity but may constrain implementation choices.
Framework APIs must therefore provide clear paths between abstraction levels, allowing developers to mix different levels of abstraction as needed for their specific use cases.
+
+### Core Libraries
+
+At the heart of every machine learning framework lies a set of core libraries, forming the foundation upon which all other components are built. These libraries provide the essential building blocks for machine learning operations, implementing fundamental tensor operations that serve as the backbone of numerical computations. Heavily optimized for performance, these operations often leverage low-level programming languages and hardware-specific optimizations to ensure efficient execution of tasks like matrix multiplication, a cornerstone of neural network computations.
+
+Alongside these basic operations, core libraries implement automatic differentiation capabilities, enabling the efficient computation of gradients for complex functions. This feature is crucial for the backpropagation algorithm that powers most neural network training. The implementation often involves intricate graph manipulation and symbolic computation techniques, abstracting away the complexities of gradient calculation from the end-user.
+
+Building upon these fundamental operations, core libraries typically provide pre-implemented neural network layers such as convolutional, recurrent, and attention mechanisms. These ready-to-use components save developers from reinventing the wheel for common model architectures, allowing them to focus on higher-level model design rather than low-level implementation details. Similarly, optimization algorithms like various flavors of gradient descent are provided out-of-the-box, further streamlining the model development process.
+
+Here is a simplified example of how these core components might be used in practice:
+
+```{.python}
+import torch
+import torch.nn as nn
+
+# Create a simple neural network
+model = nn.Sequential(
+    nn.Linear(10, 20),
+    nn.ReLU(),
+    nn.Linear(20, 1)
+)
+
+# Define loss function and optimizer
+loss_fn = nn.MSELoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+
+# Forward pass, compute loss, and backward pass
+x = torch.randn(32, 10)
+y = torch.randn(32, 1)
+y_pred = model(x)
+loss = loss_fn(y_pred, y)
+loss.backward()
+optimizer.step()
+```
+
+This example demonstrates how core libraries provide high-level abstractions for model creation, loss computation, and optimization, while handling low-level details internally.
+
+### Extensions and Plugins
+
+While core libraries offer essential functionality, the true power of modern machine learning frameworks often lies in their extensibility. Extensions and plugins expand the capabilities of frameworks, allowing them to address specialized needs and leverage cutting-edge research. Domain-specific libraries, for instance, cater to particular areas like computer vision or natural language processing, providing pre-trained models, specialized data augmentation techniques, and task-specific layers.
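+
+As a concrete sketch of such an extension, the snippet below uses torchvision (one domain-specific library among many; the exact model, transforms, and image are illustrative) to layer task-specific preprocessing and a standard vision architecture on top of the core framework:
+
+```python
+import torch
+from PIL import Image
+from torchvision import models, transforms
+
+# Task-specific preprocessing pipeline provided by the extension
+preprocess = transforms.Compose([
+    transforms.Resize(256),
+    transforms.CenterCrop(224),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                         std=[0.229, 0.224, 0.225]),
+])
+
+# A standard architecture from the library; pass weights="IMAGENET1K_V1"
+# to download pre-trained parameters (requires torchvision >= 0.13).
+model = models.resnet18(weights=None)
+model.eval()
+
+img = Image.new("RGB", (400, 300))      # stand-in for a real photograph
+batch = preprocess(img).unsqueeze(0)    # shape: [1, 3, 224, 224]
+with torch.no_grad():
+    logits = model(batch)
+print(logits.shape)                     # torch.Size([1, 1000])
+```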
+
+Hardware acceleration plugins play an important role in performance optimization, as they enable frameworks to take advantage of specialized hardware like GPUs or TPUs. These plugins dramatically speed up computations and allow seamless switching between different hardware backends, a key feature for scalability and flexibility in modern machine learning workflows.
+
+As models and datasets grow in size and complexity, distributed computing extensions also become important. These tools enable training across multiple devices or machines, handling complex tasks like data parallelism, model parallelism, and synchronization between compute nodes. This capability is essential for researchers and companies tackling large-scale machine learning problems.
+
+Complementing these computational tools are visualization and experiment tracking extensions. Visualization tools provide invaluable insights into the training process and model behavior, displaying real-time metrics and even offering interactive debugging capabilities. Experiment tracking extensions help manage the complexity of machine learning research, allowing systematic logging and comparison of different model configurations and hyperparameters.
+
+### Development Tools
+
+The ecosystem of development tools surrounding a machine learning framework further enhances its effectiveness and adoption. Interactive development environments, such as Jupyter notebooks, have become nearly ubiquitous in machine learning workflows, allowing for rapid prototyping and seamless integration of code, documentation, and outputs. Many frameworks provide custom extensions for these environments to enhance the development experience.
+
+Debugging and profiling tools address the unique challenges presented by machine learning models. Specialized debuggers allow developers to inspect the internal state of models during training and inference, while profiling tools identify bottlenecks in model execution, guiding optimization efforts. These tools are essential for developing efficient and reliable machine learning systems.
+
+As projects grow in complexity, version control integration becomes increasingly important. Tools that allow versioning of not just code, but also model weights, hyperparameters, and training data, help manage the iterative nature of model development. This comprehensive versioning approach ensures reproducibility and facilitates collaboration in large-scale machine learning projects.
+
+Finally, deployment utilities bridge the gap between development and production environments. These tools handle tasks like model compression, conversion to deployment-friendly formats, and integration with serving infrastructure, streamlining the process of moving models from experimental settings to real-world applications.
+
+## System Integration
+
+System integration is about implementing machine learning frameworks in real-world environments. This section explores how ML frameworks integrate with broader software and hardware ecosystems, addressing the challenges and considerations at each level of the integration process.
+
+### Hardware Integration
+
+Effective hardware integration is crucial for optimizing the performance of machine learning models. Modern ML frameworks must adapt to a diverse range of computing environments, from high-performance GPU clusters to resource-constrained edge devices.
+
+For GPU acceleration, frameworks like TensorFlow and PyTorch provide robust support, allowing seamless utilization of NVIDIA's CUDA platform.
This integration enables significant speedups in both training and inference tasks. Similarly, support for Google's TPUs in TensorFlow allows for even further acceleration of specific workloads. + +In distributed computing scenarios, frameworks must efficiently manage multi-device and multi-node setups. This involves strategies for data parallelism, where the same model is replicated across devices, and model parallelism, where different parts of the model are distributed across hardware units. Frameworks like Horovod have emerged to simplify distributed training across different backend frameworks. + +For edge deployment, frameworks are increasingly offering lightweight versions optimized for mobile and IoT devices. TensorFlow Lite and PyTorch Mobile, for instance, provide tools for model compression and optimization, ensuring efficient execution on devices with limited computational resources and power constraints. + +### Software Stack + +Integrating ML frameworks into existing software stacks presents unique challenges and opportunities. A key consideration is how the ML system interfaces with data processing pipelines. Frameworks often provide connectors to popular big data tools like Apache Spark or Apache Beam, allowing seamless data flow between data processing systems and ML training environments. + +Containerization technologies like Docker have become essential in ML workflows, ensuring consistency between development and production environments. Kubernetes has emerged as a popular choice for orchestrating containerized ML workloads, providing scalability and manageability for complex deployments. + +ML frameworks must also interface with other enterprise systems such as databases, message queues, and web services. For instance, TensorFlow Serving provides a flexible, high-performance serving system for machine learning models, which can be easily integrated into existing microservices architectures. + +### Deployment Considerations + +Deploying ML models to production environments involves several critical considerations. Model serving strategies must balance performance, scalability, and resource efficiency. Approaches range from batch prediction for large-scale offline processing to real-time serving for interactive applications. + +Scaling ML systems to meet production demands often involves techniques like horizontal scaling of inference servers, caching of frequent predictions, and load balancing across multiple model versions. Frameworks like TensorFlow Serving and TorchServe provide built-in solutions for many of these scaling challenges. + +Monitoring and logging are crucial for maintaining ML systems in production. This includes tracking model performance metrics, detecting concept drift, and logging prediction inputs and outputs for auditing purposes. Tools like Prometheus and Grafana are often integrated with ML serving systems to provide comprehensive monitoring solutions. + +### Workflow Orchestration + +Managing end-to-end ML pipelines requires orchestrating multiple stages, from data preparation and model training to deployment and monitoring. MLOps practices have emerged to address these challenges, bringing DevOps principles to machine learning workflows. + +Continuous Integration and Continuous Deployment (CI/CD) practices are being adapted for ML workflows. This involves automating model testing, validation, and deployment processes. 
Tools like Jenkins or GitLab CI can be extended with ML-specific stages to create robust CI/CD pipelines for machine learning projects.
+
+Automated model retraining and updating is another critical aspect of ML workflow orchestration. This involves setting up systems to automatically retrain models on new data, evaluate their performance, and seamlessly update production models when certain criteria are met. Frameworks like Kubeflow provide end-to-end ML pipelines that can automate many of these processes.
+
+Version control for ML assets, including data, model architectures, and hyperparameters, is essential for reproducibility and collaboration. Tools like DVC (Data Version Control) and MLflow have emerged to address these ML-specific version control needs.
+
+## Major Frameworks
+
+As we have seen, machine learning frameworks are complex systems. Over the years, several machine learning frameworks have emerged, each with its unique strengths and ecosystem, but few have remained as industry standards. Here we examine the mature, major players in the field, starting with a comprehensive look at TensorFlow, followed by PyTorch, JAX, and other notable frameworks.
+
+### TF Ecosystem
+
+TensorFlow was developed by the Google Brain team and was released as an open-source software library on November 9, 2015. It was designed for numerical computation using data flow graphs and has since become popular for a wide range of machine learning applications.
+
+TensorFlow is a training and inference framework that provides built-in functionality to handle everything from model creation and training to deployment, as shown in @fig-tensorflow-architecture. Since its initial development, the TensorFlow ecosystem has grown to include many different "varieties" of TensorFlow, each intended to allow users to support ML on different platforms. In this section, we focus mainly on the core package.
+
+1. [TensorFlow Core](https://www.tensorflow.org/tutorials): primary package that most developers engage with. It provides a comprehensive, flexible platform for defining, training, and deploying machine learning models. It includes [tf.keras](https://www.tensorflow.org/guide/keras) as its high-level API.
+
+2. [TensorFlow Lite](https://www.tensorflow.org/lite): designed for deploying lightweight models on mobile, embedded, and edge devices. It offers tools to convert TensorFlow models to a more compact format suitable for limited-resource devices and provides optimized pre-trained models for mobile.
+
+3. [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers): designed for running machine learning models on microcontrollers with minimal resources. It operates without the need for operating system support, standard C or C++ libraries, or dynamic memory allocation, using only a few kilobytes of memory.
+
+4. [TensorFlow.js](https://www.tensorflow.org/js): JavaScript library that allows training and deployment of machine learning models directly in the browser or on Node.js. It also provides tools for porting pre-trained TensorFlow models to a browser-friendly format.
+
+5. [TensorFlow on Edge Devices (Coral)](https://developers.googleblog.com/2019/03/introducing-coral-our-platform-for.html): platform of hardware components and software tools from Google that allows the execution of TensorFlow models on edge devices, leveraging Edge TPUs for acceleration.
+
+6. [TensorFlow Federated (TFF)](https://www.tensorflow.org/federated): framework for machine learning and other computations on decentralized data. TFF facilitates federated learning, allowing model training across many devices without centralizing the data.
+
+7. [TensorFlow Graphics](https://www.tensorflow.org/graphics): library for using TensorFlow to carry out graphics-related tasks, including the processing of 3D shapes and point clouds, using deep learning.
+
+8. [TensorFlow Hub](https://www.tensorflow.org/hub): repository of reusable machine learning model components that allows developers to reuse pre-trained models, facilitating transfer learning and model composition.
+
+9. [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving): framework designed for serving and deploying machine learning models for inference in production environments. It provides tools for versioning and dynamically updating deployed models without service interruption.
+
+10. [TensorFlow Extended (TFX)](https://www.tensorflow.org/tfx): end-to-end platform designed to deploy and manage machine learning pipelines in production settings. TFX encompasses data validation, preprocessing, model training, validation, and serving components.
+
+![Architecture overview of TensorFlow 2.0. Source: [TensorFlow.](https://blog.tensorflow.org/2019/01/whats-coming-in-tensorflow-2-0.html)](images/png/tensorflow.png){#fig-tensorflow-architecture}
+
+### PyTorch
+
+PyTorch, developed by Facebook's AI Research lab, has gained significant traction in the machine learning community, particularly among researchers and academics. Its design philosophy emphasizes ease of use, flexibility, and dynamic computation, which aligns well with the iterative nature of research and experimentation.
+
+At the heart of PyTorch's architecture lies its dynamic computational graph system. Unlike the static graphs used in earlier versions of TensorFlow, PyTorch builds the computational graph on-the-fly during execution. This approach, often referred to as "define-by-run," allows for more intuitive model design and the easier debugging we discussed earlier. Moreover, developers can use standard Python control flow statements within their models, and the graph structure can change from iteration to iteration. This flexibility is particularly advantageous when working with variable-length inputs or complex, dynamic neural network architectures.
+
+PyTorch's eager execution mode is tightly coupled with its dynamic graph approach. Operations are executed immediately as they are called, rather than being deferred for later execution in a static graph. This immediate execution facilitates easier debugging and allows for more natural integration with Python's native debugging tools. The eager execution model aligns closely with PyTorch's imperative programming style, which many developers find more intuitive and Pythonic.
+
+PyTorch's fundamental data structure is the tensor, similar to TensorFlow and other frameworks discussed in earlier sections. PyTorch tensors are conceptually equivalent to multi-dimensional arrays and can be manipulated using a rich set of operations. The framework provides seamless integration with CUDA, much like TensorFlow, enabling efficient GPU acceleration for tensor computations. PyTorch's autograd system automatically tracks all operations performed on tensors, facilitating automatic differentiation for gradient-based optimization algorithms.
+
+### JAX
+
+JAX, developed by Google Research, is a newer entrant in the field of machine learning frameworks.
Unlike TensorFlow and PyTorch, which were primarily designed for deep learning, JAX focuses on high-performance numerical computing and advanced machine learning research. Its design philosophy centers around functional programming principles and composition of transformations, offering a fresh perspective on building and optimizing machine learning systems. + +JAX is built as a NumPy-like library with added capabilities for automatic differentiation and just-in-time (JIT) compilation. This foundation makes JAX feel familiar to researchers accustomed to scientific computing in Python, while providing powerful tools for optimization and acceleration. Where TensorFlow uses static computational graphs and PyTorch employs dynamic ones, JAX takes a different approach altogether---a system for transforming numerical functions. + +One of JAX's key features is its powerful automatic differentiation system. Unlike TensorFlow's static graph approach or PyTorch's dynamic computation, JAX can differentiate native Python and NumPy functions, including those with loops, branches, and recursion. This capability extends beyond simple scalar-to-scalar functions, allowing for complex transformations like vectorization and JIT compilation. This flexibility is particularly valuable for researchers exploring novel machine learning techniques and architectures. + +JAX leverages XLA (Accelerated Linear Algebra) for just-in-time compilation, similar to TensorFlow but with a more central role in its operation. This allows JAX to optimize and compile Python code for various hardware accelerators, including GPUs and TPUs. In contrast to PyTorch's eager execution and TensorFlow's graph optimization, JAX's approach can lead to significant performance improvements, especially for complex computational patterns. + +Where TensorFlow and PyTorch primarily use object-oriented and imperative programming models, JAX embraces functional programming. This approach encourages the use of pure functions and immutable data, which can lead to more predictable and easier-to-optimize code. It's a significant departure from the stateful models common in other frameworks and can require a shift in thinking for developers accustomed to TensorFlow or PyTorch. + +JAX introduces a set of composable function transformations that set it apart from both TensorFlow and PyTorch. These include automatic differentiation (grad), just-in-time compilation (jit), automatic vectorization (vmap), and parallel execution across multiple devices (pmap). These transformations can be composed, allowing for powerful and flexible operations that are not as straightforward in other frameworks. + +### Comparison + +@tbl-mlfm-comparison provides a concise comparison of three major machine learning frameworks: TensorFlow, PyTorch, and JAX. These frameworks, while serving similar purposes, exhibit fundamental differences in their design philosophies and technical implementations. 
+ ++---------------------------+----------------------------------+------------------+----------------------------+ +| Aspect | TensorFlow | PyTorch | JAX | ++:==========================+:=================================+:=================+:===========================+ +| Graph Type | Static (1.x), Dynamic (2.x) | Dynamic | Functional transformations | ++---------------------------+----------------------------------+------------------+----------------------------+ +| Programming Model | Imperative (2.x), Symbolic (1.x) | Imperative | Functional | ++---------------------------+----------------------------------+------------------+----------------------------+ +| Core Data Structure | Tensor (mutable) | Tensor (mutable) | Array (immutable) | ++---------------------------+----------------------------------+------------------+----------------------------+ +| Execution Mode | Eager (2.x default), Graph | Eager | Just-in-time compilation | ++---------------------------+----------------------------------+------------------+----------------------------+ +| Automatic Differentiation | Reverse mode | Reverse mode | Forward and Reverse mode | ++---------------------------+----------------------------------+------------------+----------------------------+ +| Hardware Acceleration | CPU, GPU, TPU | CPU, GPU | CPU, GPU, TPU | ++---------------------------+----------------------------------+------------------+----------------------------+ + +: Core characteristics of major machine learning frameworks. {#tbl-mlfm-comparison .hover .striped} + +## Framework Specialization + +Machine Learning (ML) frameworks have evolved significantly to meet the diverse needs of different computational environments. As ML applications expand beyond traditional data centers to encompass edge devices, mobile platforms, and even tiny microcontrollers, the need for specialized frameworks has become increasingly apparent. + +Framework specialization refers to the process of tailoring ML frameworks to optimize performance, efficiency, and functionality for specific deployment environments. This specialization is crucial because the computational resources, power constraints, and use cases vary dramatically across different platforms. + +The [Open Neural Network Exchange (ONNX)](https://onnx.ai/) format plays a vital role in framework interoperability across these specialized environments. ONNX provides a standardized representation for ML models, allowing them to move between different frameworks and deployment targets. This standardization helps bridge the gap between framework specializations, enabling models trained in one environment to be optimized and deployed in another. + +The primary deployment environments that drive framework specialization are: + +1. Cloud ML: High-performance servers with abundant computational resources +2. Edge ML: Devices with moderate computing power, often requiring real-time processing +3. Mobile ML: Smartphones and tablets with varying capabilities and energy constraints +4. Tiny ML: Highly constrained devices such as microcontrollers with minimal resources + +Each of these environments presents unique challenges that influence framework design. Cloud frameworks prioritize scalability and distributed computing. Edge frameworks focus on low-latency inference and adaptability to diverse hardware. Mobile frameworks emphasize energy efficiency and integration with device-specific features. TinyML frameworks specialize in extreme resource optimization for severely constrained environments. 
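+
+To make the interoperability point above concrete, the sketch below exports a small PyTorch model to ONNX so that a different runtime or framework can consume it. This is a hedged illustration: the model and file name are placeholders, and it assumes PyTorch's ONNX exporter and its dependencies are available.
+
+```{.python}
+import torch
+import torch.nn as nn
+
+# A small stand-in model
+model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 1)).eval()
+dummy_input = torch.randn(1, 10)
+
+# Export to the framework-neutral ONNX format; other runtimes and
+# converters can then load "model.onnx" for their own deployment targets.
+torch.onnx.export(model, dummy_input, "model.onnx",
+                  input_names=["input"], output_names=["output"])
+```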
+
+In the following sections, we will explore how ML frameworks adapt to each of these environments. We will examine the specific techniques and design choices that enable frameworks to address the unique challenges of each domain, highlighting the trade-offs and optimizations that characterize framework specialization.
+
+### Cloud ML Frameworks
+
+Cloud ML frameworks are sophisticated software infrastructures designed to leverage the vast computational resources available in cloud environments. These frameworks specialize in three primary areas: distributed computing architectures, management of large-scale data and models, and integration with cloud-native services.
+
+Distributed computing is a fundamental specialization of cloud ML frameworks. These frameworks implement advanced strategies for partitioning and coordinating computational tasks across multiple machines or graphics processing units (GPUs). This capability is essential for training large-scale models on massive datasets. Both TensorFlow and PyTorch, two leading cloud ML frameworks, offer robust support for distributed computing. TensorFlow's graph-based approach (in its 1.x version) was particularly well-suited for distributed execution, while PyTorch's dynamic computational graph allows for more flexible distributed training strategies.
+
+The ability to handle large-scale data and models is another key specialization. Cloud ML frameworks are optimized to work with datasets and models that far exceed the capacity of single machines. This specialization is reflected in the core data structures of these frameworks. For instance, both TensorFlow and PyTorch use mutable Tensor objects as their primary data structure, allowing for efficient in-place operations on large datasets. JAX, a more recent framework, uses immutable arrays, which can provide benefits in terms of functional programming paradigms and optimization opportunities in distributed settings.
+
+Integration with cloud-native services is the third major specialization area. This integration enables automated resource scaling, seamless access to cloud storage, and incorporation of cloud-based monitoring and logging systems. The execution modes of different frameworks play a role here. TensorFlow 2.x and PyTorch both default to eager execution, which allows for easier integration with cloud services and debugging. JAX's just-in-time compilation offers potential performance benefits in cloud environments by optimizing computations for specific hardware.
+
+Hardware acceleration is an important aspect of cloud ML frameworks. All major frameworks support CPU and GPU execution, with TensorFlow and JAX also offering native support for Google's TPU. [NVIDIA's TensorRT](https://developer.nvidia.com/tensorrt) is an optimization tool dedicated to GPU-based inference, providing sophisticated optimizations like layer fusion, precision calibration[^defn-precision-calibration], and kernel auto-tuning to maximize throughput on NVIDIA GPUs. These hardware acceleration options allow cloud ML frameworks to efficiently utilize the diverse computational resources available in cloud environments.
+
+[^defn-precision-calibration]: A process of adjusting computations to use reduced numerical precision, balancing performance improvements with acceptable losses in accuracy.
+
+The automatic differentiation capabilities of these frameworks are particularly important in cloud settings where complex models with millions of parameters are common; the short sketch below illustrates this using JAX.
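+
+The following is a brief, hedged JAX sketch; the loss function and data are placeholders, and it assumes only that JAX is installed.
+
+```{.python}
+import jax
+import jax.numpy as jnp
+
+def loss(w, x, y):
+    pred = jnp.dot(x, w)
+    return jnp.mean((pred - y) ** 2)
+
+w = jnp.ones(3)
+x = jnp.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+y = jnp.array([1.0, 2.0])
+
+grad_fn = jax.grad(loss)             # reverse-mode gradient with respect to w
+jac_fwd = jax.jacfwd(loss)           # the same derivative via forward mode
+fast_grad = jax.jit(jax.grad(loss))  # transformations compose with JIT compilation
+
+print(grad_fn(w, x, y), jac_fwd(w, x, y), fast_grad(w, x, y))
+```
+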
While TensorFlow and PyTorch primarily use reverse-mode differentiation, JAX's support for both forward and reverse-mode differentiation can offer advantages in certain large-scale optimization scenarios. + +These specializations enable cloud ML frameworks to fully utilize the scalability and computational power of cloud infrastructure. However, this capability comes with increased complexity in deployment and management, often requiring specialized knowledge to fully leverage these frameworks. The focus on scalability and integration makes cloud ML frameworks particularly suitable for large-scale research projects, enterprise-level ML applications, and scenarios requiring massive computational resources. + +### Edge ML Frameworks + +Edge ML frameworks are specialized software tools designed to facilitate machine learning operations in edge computing environments, characterized by proximity to data sources, stringent latency requirements, and limited computational resources. Examples of popular edge ML frameworks include [TensorFlow Lite](https://www.tensorflow.org/lite) and [Edge Impulse](https://www.edgeimpulse.com). The specialization of these frameworks addresses three primary challenges: real-time inference optimization, adaptation to heterogeneous hardware, and resource-constrained operation. + +Real-time inference optimization is a critical feature of edge ML frameworks. This often involves leveraging different execution modes and graph types. For instance, while TensorFlow Lite (the edge-focused version of TensorFlow) uses a static graph approach to optimize inference, frameworks like [PyTorch Mobile](https://pytorch.org/mobile/home/) maintain a dynamic graph capability, allowing for more flexible model structures at the cost of some performance. The choice between static and dynamic graphs in edge frameworks often represents a trade-off between optimization potential and model flexibility. + +Adaptation to heterogeneous hardware is crucial for edge deployments. Edge ML frameworks extend the hardware acceleration capabilities of their cloud counterparts but with a focus on edge-specific hardware. For instance, TensorFlow Lite supports acceleration on mobile GPUs and edge TPUs, while frameworks like [ARM's Compute Library](https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides) optimize for ARM-based processors. This specialization often involves custom operator implementations and low-level optimizations specific to edge hardware. + +Operating within resource constraints is another aspect of edge ML framework specialization. This is reflected in the core data structures and execution models of these frameworks. For instance, many edge frameworks use quantized tensors as their primary data structure, representing values with reduced precision (e.g., 8-bit integers instead of 32-bit floats) to decrease memory usage and computational demands. The automatic differentiation capabilities, while crucial for training in cloud environments, are often stripped down or removed entirely in edge frameworks to reduce model size and improve inference speed. + +Edge ML frameworks also often include features for model versioning and updates, allowing for the deployment of new models with minimal system downtime. Some frameworks support limited on-device learning, enabling models to adapt to local data without compromising data privacy. 
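+
+As an illustration of how this resource-constrained optimization typically looks in practice, the sketch below converts a small Keras model to TensorFlow Lite with default optimizations, which apply post-training quantization of the weights. This is a minimal example; the model is a stand-in, and API details can vary across TensorFlow versions.
+
+```{.python}
+import tensorflow as tf
+
+# A small Keras model standing in for a trained network
+model = tf.keras.Sequential([
+    tf.keras.Input(shape=(10,)),
+    tf.keras.layers.Dense(16, activation="relu"),
+    tf.keras.layers.Dense(1),
+])
+
+# Convert to TensorFlow Lite and request default optimizations,
+# which include post-training quantization of the weights
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+tflite_model = converter.convert()
+
+with open("model.tflite", "wb") as f:
+    f.write(tflite_model)
+```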
+ +The specializations of edge ML frameworks collectively enable high-performance inference in resource-constrained environments. This capability expands the potential applications of AI in areas with limited cloud connectivity or where real-time processing is crucial. However, effective utilization of these frameworks requires careful consideration of target hardware specifications and application-specific requirements, necessitating a balance between model accuracy and resource utilization. + +### Mobile ML Frameworks + +Mobile ML frameworks are specialized software tools designed for deploying and executing machine learning models on smartphones and tablets. Examples include TensorFlow Lite and [Apple's Core ML](https://developer.apple.com/documentation/coreml/). These frameworks address the unique challenges of mobile environments, including limited computational resources, constrained power consumption, and diverse hardware configurations. The specialization of mobile ML frameworks primarily focuses on on-device inference optimization, energy efficiency, and integration with mobile-specific hardware and sensors. + +On-device inference optimization in mobile ML frameworks often involves a careful balance between graph types and execution modes. For instance, TensorFlow Lite, also a popular mobile ML framework, uses a static graph approach to optimize inference performance. This contrasts with the dynamic graph capability of PyTorch Mobile, which offers more flexibility at the cost of some performance. The choice between static and dynamic graphs in mobile frameworks represents a trade-off between optimization potential and model adaptability, crucial in the diverse and changing mobile environment. + +The core data structures in mobile ML frameworks are optimized for efficient memory usage and computation. While cloud-based frameworks like TensorFlow and PyTorch use mutable tensors, mobile frameworks often employ more specialized data structures. For example, many mobile frameworks use quantized tensors, representing values with reduced precision (e.g., 8-bit integers instead of 32-bit floats) to decrease memory footprint and computational demands. This specialization is critical given the limited RAM and processing power of mobile devices. + +Energy efficiency, a paramount concern in mobile environments, influences the design of execution modes in mobile ML frameworks. Unlike cloud frameworks that may use eager execution for ease of development, mobile frameworks often prioritize graph-based execution for its potential energy savings. For instance, Apple's Core ML uses a compiled model approach, converting ML models into a form that can be efficiently executed by iOS devices, optimizing for both performance and energy consumption. + +Integration with mobile-specific hardware and sensors is another key specialization area. Mobile ML frameworks extend the hardware acceleration capabilities of their cloud counterparts but with a focus on mobile-specific processors. For example, TensorFlow Lite can leverage mobile GPUs and neural processing units (NPUs) found in many modern smartphones. Qualcomm's Neural Processing SDK is designed to efficiently utilize the AI accelerators present in Snapdragon SoCs. This hardware-specific optimization often involves custom operator implementations and low-level optimizations tailored for mobile processors. 
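+
+A brief sketch of the typical mobile-deployment path in PyTorch is shown below: the model is traced into a static graph, passed through mobile-specific graph optimizations, and saved for on-device inference. The model and file name are illustrative, and the sketch assumes a standard PyTorch installation.
+
+```{.python}
+import torch
+import torch.nn as nn
+from torch.utils.mobile_optimizer import optimize_for_mobile
+
+# A small stand-in model for illustration
+model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 1)).eval()
+
+# Trace the model into a static TorchScript graph
+example_input = torch.randn(1, 10)
+traced = torch.jit.trace(model, example_input)
+
+# Apply mobile-specific graph optimizations and save for on-device use
+mobile_model = optimize_for_mobile(traced)
+mobile_model.save("model_mobile.pt")
+```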
+ +Automatic differentiation, while crucial for training in cloud environments, is often minimized or removed entirely in mobile frameworks to reduce model size and improve inference speed. Instead, mobile ML frameworks focus on efficient inference, with model updates typically performed off-device and then deployed to the mobile application. + +Mobile ML frameworks also often include features for model updating and versioning, allowing for the deployment of improved models without requiring full app updates. Some frameworks support limited on-device learning, enabling models to adapt to user behavior or environmental changes without compromising data privacy. + +The specializations of mobile ML frameworks collectively enable the deployment of sophisticated ML models on resource-constrained mobile devices. This expands the potential applications of AI in mobile environments, ranging from real-time image and speech recognition to personalized user experiences. However, effectively utilizing these frameworks requires careful consideration of the target device capabilities, user experience requirements, and privacy implications, necessitating a balance between model performance and resource utilization. + +### TinyML Frameworks + +TinyML frameworks are specialized software infrastructures designed for deploying machine learning models on extremely resource-constrained devices, typically microcontrollers and low-power embedded systems. These frameworks address the severe limitations in processing power, memory, and energy consumption characteristic of tiny devices. The specialization of TinyML frameworks primarily focuses on extreme model compression, optimizations for severely constrained environments, and integration with microcontroller-specific architectures. + +Extreme model compression in TinyML frameworks takes the quantization techniques mentioned in mobile and edge frameworks to their logical conclusion. While mobile frameworks might use 8-bit quantization, TinyML often employs even more aggressive techniques, such as 4-bit, 2-bit, or even 1-bit (binary) representations of model parameters. Frameworks like TensorFlow Lite Micro exemplify this approach [@david2021tensorflow], pushing the boundaries of model compression to fit within the kilobytes of memory available on microcontrollers. + +The execution model in TinyML frameworks is highly specialized. Unlike the dynamic graph capabilities seen in some cloud and mobile frameworks, TinyML frameworks almost exclusively use static, highly optimized graphs. The just-in-time compilation approach seen in frameworks like JAX is typically not feasible in TinyML due to memory constraints. Instead, these frameworks often employ ahead-of-time compilation techniques to generate highly optimized, device-specific code. + +Memory management in TinyML frameworks is far more constrained than in other environments. While edge and mobile frameworks might use dynamic memory allocation, TinyML frameworks like [uTensor](https://utensor.github.io) often rely on static memory allocation to avoid runtime overhead and fragmentation. This approach requires careful planning of the memory layout at compile time, a stark contrast to the more flexible memory management in cloud-based frameworks. + +Hardware integration in TinyML frameworks is highly specific to microcontroller architectures. 
Unlike the general GPU support seen in cloud frameworks or the mobile GPU/NPU support in mobile frameworks, TinyML frameworks often provide optimizations for specific microcontroller instruction sets. For example, ARM's CMSIS-NN [@lai2018cmsis] provides optimized neural network kernels for Cortex-M series microcontrollers, which are often integrated into TinyML frameworks. + +The concept of automatic differentiation, central to cloud-based frameworks and present to some degree in edge and mobile frameworks, is typically absent in TinyML frameworks. The focus is almost entirely on inference, with any learning or model updates usually performed off-device due to the severe computational constraints. + +TinyML frameworks also specialize in power management to a degree not seen in other ML environments. Features like duty cycling and ultra-low-power wake-up capabilities are often integrated directly into the ML pipeline, enabling always-on sensing applications that can run for years on small batteries. + +The extreme specialization of TinyML frameworks enables ML deployments in previously infeasible environments, from smart dust sensors to implantable medical devices. However, this specialization comes with significant trade-offs in model complexity and accuracy, requiring careful consideration of the balance between ML capabilities and the severe resource constraints of target devices. + +## Conclusion + +AI frameworks have evolved from basic numerical libraries into sophisticated software systems that shape how we develop and deploy machine learning applications. The progression from early numerical computing to modern deep learning frameworks demonstrates the field's rapid technological advancement. + +Modern frameworks like TensorFlow, PyTorch, and JAX implement distinct approaches to common challenges in machine learning development. Each framework offers varying tradeoffs between ease of use, performance, and flexibility. TensorFlow emphasizes production deployment, PyTorch focuses on research and experimentation, while JAX prioritizes functional programming patterns. + +The specialization of frameworks into cloud, edge, mobile, and tiny ML implementations reflects the diverse requirements of machine learning applications. Cloud frameworks optimize for scalability and distributed computing. Edge and mobile frameworks prioritize model efficiency and reduced resource consumption. TinyML frameworks target constrained environments with minimal computing resources. + +Understanding framework architecture, from tensor operations to execution models, enables developers to select appropriate tools for specific use cases, optimize application performance, debug complex computational graphs, and deploy models across different computing environments. + +The continuing evolution of AI frameworks will likely focus on improving developer productivity, hardware acceleration, and deployment flexibility. These advancements will shape how machine learning systems are built and deployed across increasingly diverse computing environments. \ No newline at end of file diff --git a/contents/core/frameworks/images/png/fm_building_blocks.png b/contents/core/frameworks/images/png/fm_building_blocks.png index 0d868dff..b4abbe91 100644 Binary files a/contents/core/frameworks/images/png/fm_building_blocks.png and b/contents/core/frameworks/images/png/fm_building_blocks.png differ