diff --git a/.gitattributes b/.gitattributes index 7e6edeb266027732f77d03b28bf33d27a540d311..2ec3d3d0c51afa249e5795da114f7cf6dbc14780 100644 --- a/.gitattributes +++ b/.gitattributes @@ -65,3 +65,14 @@ graphrag-ollama/lib/python3.12/site-packages/regex/_regex.cpython-312-x86_64-lin graphrag-ollama/lib/python3.12/site-packages/sqlalchemy/cyextension/collections.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text graphrag-ollama/lib/python3.12/site-packages/tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text graphrag-ollama/lib/python3.12/site-packages/yaml/_yaml.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/_flight.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/lib.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/libarrow.so.1800 filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/libarrow_acero.so.1800 filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/libarrow_dataset.so.1800 filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/libarrow_flight.so.1800 filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/libarrow_python.so filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/libarrow_substrait.so.1800 filter=lfs diff=lfs merge=lfs -text +graphrag-ollama/lib/python3.12/site-packages/pyarrow/libparquet.so.1800 filter=lfs diff=lfs merge=lfs -text diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/INSTALLER b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/LICENSE.txt b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bb1330a1002b78e748296b4be96a72f3fb67b4e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/LICENSE.txt @@ -0,0 +1,2261 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ +-------------------------------------------------------------------------------- + +src/arrow/util (some portions): Apache 2.0, and 3-clause BSD + +Some portions of this module are derived from code in the Chromium project, +copyright (c) Google inc and (c) The Chromium Authors and licensed under the +Apache 2.0 License or the under the 3-clause BSD license: + + Copyright (c) 2013 The Chromium Authors. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This project includes code from Daniel Lemire's FrameOfReference project. + +https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp +https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py + +Copyright: 2013 Daniel Lemire +Home page: http://lemire.me/en/ +Project page: https://github.com/lemire/FrameOfReference +License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This project includes code from the TensorFlow project + +Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +This project includes code from the NumPy project. 
+ +https://github.com/numpy/numpy/blob/e1f191c46f2eebd6cb892a4bfe14d9dd43a06c4e/numpy/core/src/multiarray/multiarraymodule.c#L2910 + +https://github.com/numpy/numpy/blob/68fd82271b9ea5a9e50d4e761061dfcca851382a/numpy/core/src/multiarray/datetime.c + +Copyright (c) 2005-2017, NumPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This project includes code from the Boost project + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This project includes code from the FlatBuffers project + +Copyright 2014 Google Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +This project includes code from the tslib project + +Copyright 2015 Microsoft Corporation. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +This project includes code from the jemalloc project + +https://github.com/jemalloc/jemalloc + +Copyright (C) 2002-2017 Jason Evans . +All rights reserved. +Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. +Copyright (C) 2009-2017 Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +-------------------------------------------------------------------------------- + +This project includes code from the Go project, BSD 3-clause license + PATENTS +weak patent termination clause +(https://github.com/golang/go/blob/master/PATENTS). + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This project includes code from the hs2client + +https://github.com/cloudera/hs2client + +Copyright 2016 Cloudera Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +The script ci/scripts/util_wait_for_it.sh has the following license + +Copyright (c) 2016 Giles Hall + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +-------------------------------------------------------------------------------- + +The script r/configure has the following license (MIT) + +Copyright (c) 2017, Jeroen Ooms and Jim Hester + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +cpp/src/arrow/util/logging.cc, cpp/src/arrow/util/logging.h and +cpp/src/arrow/util/logging-test.cc are adapted from +Ray Project (https://github.com/ray-project/ray) (Apache 2.0). + +Copyright (c) 2016 Ray Project (https://github.com/ray-project/ray) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- +The files cpp/src/arrow/vendored/datetime/date.h, cpp/src/arrow/vendored/datetime/tz.h, +cpp/src/arrow/vendored/datetime/tz_private.h, cpp/src/arrow/vendored/datetime/ios.h, +cpp/src/arrow/vendored/datetime/ios.mm, +cpp/src/arrow/vendored/datetime/tz.cpp are adapted from +Howard Hinnant's date library (https://github.com/HowardHinnant/date) +It is licensed under MIT license. + +The MIT License (MIT) +Copyright (c) 2015, 2016, 2017 Howard Hinnant +Copyright (c) 2016 Adrian Colomitchi +Copyright (c) 2017 Florian Dang +Copyright (c) 2017 Paul Thompson +Copyright (c) 2018 Tomasz KamiƄski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +The file cpp/src/arrow/util/utf8.h includes code adapted from the page + https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ +with the following license (MIT) + +Copyright (c) 2008-2009 Bjoern Hoehrmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/xxhash/ have the following license +(BSD 2-Clause License) + +xxHash Library +Copyright (c) 2012-2014, Yann Collet +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +You can contact the author at : +- xxHash homepage: http://www.xxhash.com +- xxHash source repository : https://github.com/Cyan4973/xxHash + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/double-conversion/ have the following license +(BSD 3-Clause License) + +Copyright 2006-2011, the V8 project authors. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/uriparser/ have the following license +(BSD 3-Clause License) + +uriparser - RFC 3986 URI parsing library + +Copyright (C) 2007, Weijia Song +Copyright (C) 2007, Sebastian Pipping +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + * Neither the name of the nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +The files under dev/tasks/conda-recipes have the following license + +BSD 3-clause license +Copyright (c) 2015-2018, conda-forge +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/utfcpp/ have the following license + +Copyright 2006-2018 Nemanja Trifunovic + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This project includes code from Apache Kudu. + + * cpp/cmake_modules/CompilerInfo.cmake is based on Kudu's cmake_modules/CompilerInfo.cmake + +Copyright: 2016 The Apache Software Foundation. +Home page: https://kudu.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This project includes code from Apache Impala (incubating), formerly +Impala. The Impala code and rights were donated to the ASF as part of the +Incubator process after the initial code imports into Apache Parquet. + +Copyright: 2012 Cloudera, Inc. +Copyright: 2016 The Apache Software Foundation. +Home page: http://impala.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This project includes code from Apache Aurora. + +* dev/release/{release,changelog,release-candidate} are based on the scripts from + Apache Aurora + +Copyright: 2016 The Apache Software Foundation. +Home page: https://aurora.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This project includes code from the Google styleguide. + +* cpp/build-support/cpplint.py is based on the scripts from the Google styleguide. + +Copyright: 2009 Google Inc. All rights reserved. +Homepage: https://github.com/google/styleguide +License: 3-clause BSD + +-------------------------------------------------------------------------------- + +This project includes code from Snappy. + +* cpp/cmake_modules/{SnappyCMakeLists.txt,SnappyConfig.h} are based on code + from Google's Snappy project. + +Copyright: 2009 Google Inc. All rights reserved. +Homepage: https://github.com/google/snappy +License: 3-clause BSD + +-------------------------------------------------------------------------------- + +This project includes code from the manylinux project. + +* python/manylinux1/scripts/{build_python.sh,python-tag-abi-tag.py, + requirements.txt} are based on code from the manylinux project. + +Copyright: 2016 manylinux +Homepage: https://github.com/pypa/manylinux +License: The MIT License (MIT) + +-------------------------------------------------------------------------------- + +This project includes code from the cymove project: + +* python/pyarrow/includes/common.pxd includes code from the cymove project + +The MIT License (MIT) +Copyright (c) 2019 Omer Ozarslan + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +The projects includes code from the Ursabot project under the dev/archery +directory. + +License: BSD 2-Clause + +Copyright 2019 RStudio, Inc. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This project include code from mingw-w64. + +* cpp/src/arrow/util/cpu-info.cc has a polyfill for mingw-w64 < 5 + +Copyright (c) 2009 - 2013 by the mingw-w64 project +Homepage: https://mingw-w64.org +License: Zope Public License (ZPL) Version 2.1. + +--------------------------------------------------------------------------------- + +This project include code from Google's Asylo project. + +* cpp/src/arrow/result.h is based on status_or.h + +Copyright (c) Copyright 2017 Asylo authors +Homepage: https://asylo.dev/ +License: Apache 2.0 + +-------------------------------------------------------------------------------- + +This project includes code from Google's protobuf project + +* cpp/src/arrow/result.h ARROW_ASSIGN_OR_RAISE is based off ASSIGN_OR_RETURN +* cpp/src/arrow/util/bit_stream_utils.h contains code from wire_format_lite.h + +Copyright 2008 Google Inc. All rights reserved. +Homepage: https://developers.google.com/protocol-buffers/ +License: + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. 
nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Code generated by the Protocol Buffer compiler is owned by the owner +of the input file used when generating it. This code is not +standalone and requires a support library to be linked with it. This +support library is itself covered by the above license. + +-------------------------------------------------------------------------------- + +3rdparty dependency LLVM is statically linked in certain binary distributions. +Additionally some sections of source code have been derived from sources in LLVM +and have been clearly labeled as such. LLVM has the following license: + +============================================================================== +The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: +============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +---- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. 
+ +============================================================================== +Software from third parties included in the LLVM Project: +============================================================================== +The LLVM Project contains third party software which is under different license +terms. All such code will be identified clearly using at least one of two +mechanisms: +1) It will be in a separate directory tree with its own `LICENSE.txt` or + `LICENSE` file at the top containing the specific license and restrictions + which apply to that software, or +2) It will contain specific license and restriction terms at the top of every + file. + +-------------------------------------------------------------------------------- + +3rdparty dependency gRPC is statically linked in certain binary +distributions, like the python wheels. gRPC has the following license: + +Copyright 2014 gRPC authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +3rdparty dependency Apache Thrift is statically linked in certain binary +distributions, like the python wheels. Apache Thrift has the following license: + +Apache Thrift +Copyright (C) 2006 - 2019, The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +3rdparty dependency Apache ORC is statically linked in certain binary +distributions, like the python wheels. Apache ORC has the following license: + +Apache ORC +Copyright 2013-2019 The Apache Software Foundation + +This product includes software developed by The Apache Software +Foundation (http://www.apache.org/). + +This product includes software developed by Hewlett-Packard: +(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +-------------------------------------------------------------------------------- + +3rdparty dependency zstd is statically linked in certain binary +distributions, like the python wheels. ZSTD has the following license: + +BSD License + +For Zstandard software + +Copyright (c) 2016-present, Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Facebook nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +3rdparty dependency lz4 is statically linked in certain binary +distributions, like the python wheels. lz4 has the following license: + +LZ4 Library +Copyright (c) 2011-2016, Yann Collet +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +3rdparty dependency Brotli is statically linked in certain binary +distributions, like the python wheels. 
Brotli has the following license: + +Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +-------------------------------------------------------------------------------- + +3rdparty dependency rapidjson is statically linked in certain binary +distributions, like the python wheels. rapidjson and its dependencies have the +following licenses: + +Tencent is pleased to support the open source community by making RapidJSON +available. + +Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. +All rights reserved. + +If you have downloaded a copy of the RapidJSON binary from Tencent, please note +that the RapidJSON binary is licensed under the MIT License. +If you have downloaded a copy of the RapidJSON source code from Tencent, please +note that RapidJSON source code is licensed under the MIT License, except for +the third-party components listed below which are subject to different license +terms. Your integration of RapidJSON into your own projects may require +compliance with the MIT License, as well as the other licenses applicable to +the third-party components included within RapidJSON. To avoid the problematic +JSON license in your own projects, it's sufficient to exclude the +bin/jsonchecker/ directory, as it's the only code under the JSON license. +A copy of the MIT License is included in this file. + +Other dependencies and licenses: + + Open Source Software Licensed Under the BSD License: + -------------------------------------------------------------------- + + The msinttypes r29 + Copyright (c) 2006-2013 Alexander Chemeris + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + DAMAGE. + + Terms of the MIT License: + -------------------------------------------------------------------- + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +3rdparty dependency snappy is statically linked in certain binary +distributions, like the python wheels. snappy has the following license: + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Google Inc. nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +=== + +Some of the benchmark data in testdata/ is licensed differently: + + - fireworks.jpeg is Copyright 2013 Steinar H. Gunderson, and + is licensed under the Creative Commons Attribution 3.0 license + (CC-BY-3.0). See https://creativecommons.org/licenses/by/3.0/ + for more information. + + - kppkn.gtb is taken from the Gaviota chess tablebase set, and + is licensed under the MIT License. See + https://sites.google.com/site/gaviotachessengine/Home/endgame-tablebases-1 + for more information. + + - paper-100k.pdf is an excerpt (bytes 92160 to 194560) from the paper + “Combinatorial Modeling of Chromatin Features Quantitatively Predicts DNA + Replication Timing in _Drosophila_” by Federico Comoglio and Renato Paro, + which is licensed under the CC-BY license. See + http://www.ploscompbiol.org/static/license for more ifnormation. + + - alice29.txt, asyoulik.txt, plrabn12.txt and lcet10.txt are from Project + Gutenberg. The first three have expired copyrights and are in the public + domain; the latter does not have expired copyright, but is still in the + public domain according to the license information + (http://www.gutenberg.org/ebooks/53). + +-------------------------------------------------------------------------------- + +3rdparty dependency gflags is statically linked in certain binary +distributions, like the python wheels. gflags has the following license: + +Copyright (c) 2006, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +-------------------------------------------------------------------------------- + +3rdparty dependency glog is statically linked in certain binary +distributions, like the python wheels. glog has the following license: + +Copyright (c) 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +A function gettimeofday in utilities.cc is based on + +http://www.google.com/codesearch/p?hl=en#dR3YEbitojA/COPYING&q=GetSystemTimeAsFileTime%20license:bsd + +The license of this code is: + +Copyright (c) 2003-2008, Jouni Malinen and contributors +All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name(s) of the above-listed copyright holder(s) nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +-------------------------------------------------------------------------------- + +3rdparty dependency re2 is statically linked in certain binary +distributions, like the python wheels. re2 has the following license: + +Copyright (c) 2009 The RE2 Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +3rdparty dependency c-ares is statically linked in certain binary +distributions, like the python wheels. c-ares has the following license: + +# c-ares license + +Copyright (c) 2007 - 2018, Daniel Stenberg with many contributors, see AUTHORS +file. + +Copyright 1998 by the Massachusetts Institute of Technology. + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, provided that +the above copyright notice appear in all copies and that both that copyright +notice and this permission notice appear in supporting documentation, and that +the name of M.I.T. not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior permission. +M.I.T. makes no representations about the suitability of this software for any +purpose. It is provided "as is" without express or implied warranty. + +-------------------------------------------------------------------------------- + +3rdparty dependency zlib is redistributed as a dynamically linked shared +library in certain binary distributions, like the python wheels. In the future +this will likely change to static linkage. zlib has the following license: + +zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.11, January 15th, 2017 + + Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. 
+ + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + +-------------------------------------------------------------------------------- + +3rdparty dependency openssl is redistributed as a dynamically linked shared +library in certain binary distributions, like the python wheels. openssl +preceding version 3 has the following license: + + LICENSE ISSUES + ============== + + The OpenSSL toolkit stays under a double license, i.e. both the conditions of + the OpenSSL License and the original SSLeay license apply to the toolkit. + See below for the actual license texts. + + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2019 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +-------------------------------------------------------------------------------- + +This project includes code from the rtools-backports project. + +* ci/scripts/PKGBUILD and ci/scripts/r_windows_build.sh are based on code + from the rtools-backports project. + +Copyright: Copyright (c) 2013 - 2019, АлДĐșсДĐč and Jeroen Ooms. +All rights reserved. +Homepage: https://github.com/r-windows/rtools-backports +License: 3-clause BSD + +-------------------------------------------------------------------------------- + +Some code from pandas has been adapted for the pyarrow codebase. pandas is +available under the 3-clause BSD license, which follows: + +pandas license +============== + +Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Copyright (c) 2008-2011 AQR Capital Management, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +Some bits from DyND, in particular aspects of the build system, have been +adapted from libdynd and dynd-python under the terms of the BSD 2-clause +license + +The BSD 2-Clause License + + Copyright (C) 2011-12, Dynamic NDArray Developers + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Dynamic NDArray Developers list: + + * Mark Wiebe + * Continuum Analytics + +-------------------------------------------------------------------------------- + +Some source code from Ibis (https://github.com/cloudera/ibis) has been adapted +for PyArrow. Ibis is released under the Apache License, Version 2.0. + +-------------------------------------------------------------------------------- + +dev/tasks/homebrew-formulae/apache-arrow.rb has the following license: + +BSD 2-Clause License + +Copyright (c) 2009-present, Homebrew contributors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +cpp/src/arrow/vendored/base64.cpp has the following license + +ZLIB License + +Copyright (C) 2004-2017 RenĂ© Nyffenegger + +This source code is provided 'as-is', without any express or implied +warranty. In no event will the author be held liable for any damages arising +from the use of this software. 
+ +Permission is granted to anyone to use this software for any purpose, including +commercial applications, and to alter it and redistribute it freely, subject to +the following restrictions: + +1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + +3. This notice may not be removed or altered from any source distribution. + +RenĂ© Nyffenegger rene.nyffenegger@adp-gmbh.ch + +-------------------------------------------------------------------------------- + +This project includes code from Folly. + + * cpp/src/arrow/vendored/ProducerConsumerQueue.h + +is based on Folly's + + * folly/Portability.h + * folly/lang/Align.h + * folly/ProducerConsumerQueue.h + +Copyright: Copyright (c) Facebook, Inc. and its affiliates. +Home page: https://github.com/facebook/folly +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +The file cpp/src/arrow/vendored/musl/strptime.c has the following license + +Copyright © 2005-2020 Rich Felker, et al. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +The file cpp/cmake_modules/BuildUtils.cmake contains code from + +https://gist.github.com/cristianadam/ef920342939a89fae3e8a85ca9459b49 + +which is made available under the MIT license + +Copyright (c) 2019 Cristian Adam + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/portable-snippets/ contain code from + +https://github.com/nemequ/portable-snippets + +and have the following copyright notice: + +Each source file contains a preamble explaining the license situation +for that file, which takes priority over this file. With the +exception of some code pulled in from other repositories (such as +”nit, an MIT-licensed project which is used for testing), the code is +public domain, released using the CC0 1.0 Universal dedication (*). + +(*) https://creativecommons.org/publicdomain/zero/1.0/legalcode + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/fast_float/ contain code from + +https://github.com/lemire/fast_float + +which is made available under the Apache License 2.0. + +-------------------------------------------------------------------------------- + +The file python/pyarrow/vendored/docscrape.py contains code from + +https://github.com/numpy/numpydoc/ + +which is made available under the BSD 2-clause license. + +-------------------------------------------------------------------------------- + +The file python/pyarrow/vendored/version.py contains code from + +https://github.com/pypa/packaging/ + +which is made available under both the Apache license v2.0 and the +BSD 2-clause license. + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/pcg contain code from + +https://github.com/imneme/pcg-cpp + +and have the following copyright notice: + +Copyright 2014-2019 Melissa O'Neill , + and the PCG Project contributors. + +SPDX-License-Identifier: (Apache-2.0 OR MIT) + +Licensed under the Apache License, Version 2.0 (provided in +LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) +or under the MIT license (provided in LICENSE-MIT.txt and at +http://opensource.org/licenses/MIT), at your option. This file may not +be copied, modified, or distributed except according to those terms. + +Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either +express or implied. See your chosen license for details. + +-------------------------------------------------------------------------------- +r/R/dplyr-count-tally.R (some portions) + +Some portions of this file are derived from code from + +https://github.com/tidyverse/dplyr/ + +which is made available under the MIT license + +Copyright (c) 2013-2019 RStudio and others. + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the “Software”), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +The file src/arrow/util/io_util.cc contains code from the CPython project +which is made available under the Python Software Foundation License Version 2. + +-------------------------------------------------------------------------------- + +3rdparty dependency opentelemetry-cpp is statically linked in certain binary +distributions. opentelemetry-cpp is made available under the Apache License 2.0. + +Copyright The OpenTelemetry Authors +SPDX-License-Identifier: Apache-2.0 + +-------------------------------------------------------------------------------- + +ci/conan/ is based on code from Conan Package and Dependency Manager. + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +3rdparty dependency UCX is redistributed as a dynamically linked shared +library in certain binary distributions. UCX has the following license: + +Copyright (c) 2014-2015 UT-Battelle, LLC. All rights reserved. +Copyright (C) 2014-2020 Mellanox Technologies Ltd. All rights reserved. +Copyright (C) 2014-2015 The University of Houston System. All rights reserved. +Copyright (C) 2015 The University of Tennessee and The University + of Tennessee Research Foundation. All rights reserved. +Copyright (C) 2016-2020 ARM Ltd. All rights reserved. +Copyright (c) 2016 Los Alamos National Security, LLC. All rights reserved. +Copyright (C) 2016-2020 Advanced Micro Devices, Inc. All rights reserved. +Copyright (C) 2019 UChicago Argonne, LLC. All rights reserved. +Copyright (c) 2018-2020 NVIDIA CORPORATION. All rights reserved. +Copyright (C) 2020 Huawei Technologies Co., Ltd. All rights reserved. +Copyright (C) 2016-2020 Stony Brook University. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +The file dev/tasks/r/github.packages.yml contains code from + +https://github.com/ursa-labs/arrow-r-nightly + +which is made available under the Apache License 2.0. + +-------------------------------------------------------------------------------- +.github/actions/sync-nightlies/action.yml (some portions) + +Some portions of this file are derived from code from + +https://github.com/JoshPiper/rsync-docker + +which is made available under the MIT license + +Copyright (c) 2020 Joshua Piper + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +-------------------------------------------------------------------------------- +.github/actions/sync-nightlies/action.yml (some portions) + +Some portions of this file are derived from code from + +https://github.com/burnett01/rsync-deployments + +which is made available under the MIT license + +Copyright (c) 2019-2022 Contention +Copyright (c) 2019-2022 Burnett01 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java + +These file are derived from code from Netty, which is made available under the +Apache License 2.0. diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/METADATA b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..0dd8c84e166b4aea3776b19fdd964d4a89ded45b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/METADATA @@ -0,0 +1,86 @@ +Metadata-Version: 2.1 +Name: pyarrow +Version: 18.0.0 +Summary: Python library for Apache Arrow +Maintainer-email: Apache Arrow Developers +License: Apache Software License +Project-URL: Homepage, https://arrow.apache.org/ +Project-URL: Documentation, https://arrow.apache.org/docs/python +Project-URL: Repository, https://github.com/apache/arrow +Project-URL: Issues, https://github.com/apache/arrow/issues +Classifier: License :: OSI Approved :: Apache Software License +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 +Requires-Python: >=3.9 +Description-Content-Type: text/markdown +License-File: ../LICENSE.txt +License-File: ../NOTICE.txt +Provides-Extra: test +Requires-Dist: pytest ; extra == 'test' +Requires-Dist: hypothesis ; extra == 'test' +Requires-Dist: cffi ; extra == 'test' +Requires-Dist: pytz ; extra == 'test' +Requires-Dist: pandas ; extra == 'test' + + + +## Python library for Apache Arrow + +[![pypi](https://img.shields.io/pypi/v/pyarrow.svg)](https://pypi.org/project/pyarrow/) [![conda-forge](https://img.shields.io/conda/vn/conda-forge/pyarrow.svg)](https://anaconda.org/conda-forge/pyarrow) + +This library provides a Python API for functionality provided by 
the Arrow C++ +libraries, along with tools for Arrow integration and interoperability with +pandas, NumPy, and other software in the Python ecosystem. + +## Installing + +Across platforms, you can install a recent version of pyarrow with the conda +package manager: + +```shell +conda install pyarrow -c conda-forge +``` + +On Linux, macOS, and Windows, you can also install binary wheels from PyPI with +pip: + +```shell +pip install pyarrow +``` + +If you encounter any issues importing the pip wheels on Windows, you may need +to install the [Visual C++ Redistributable for Visual Studio 2015][6]. + +## Development + +See [Python Development][2] in the documentation subproject. + +### Building the documentation + +See [documentation build instructions][1] in the documentation subproject. + +[1]: https://github.com/apache/arrow/blob/main/docs/source/developers/documentation.rst +[2]: https://github.com/apache/arrow/blob/main/docs/source/developers/python.rst +[3]: https://github.com/pandas-dev/pandas +[5]: https://arrow.apache.org/docs/latest/python/benchmarks.html +[6]: https://www.microsoft.com/en-us/download/details.aspx?id=48145 diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/NOTICE.txt b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/NOTICE.txt new file mode 100644 index 0000000000000000000000000000000000000000..2089c6fb20358ccf5e3a58ea27a234555b923f6b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/NOTICE.txt @@ -0,0 +1,84 @@ +Apache Arrow +Copyright 2016-2024 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +This product includes software from the SFrame project (BSD, 3-clause). +* Copyright (C) 2015 Dato, Inc. +* Copyright (c) 2009 Carnegie Mellon University. + +This product includes software from the Feather project (Apache 2.0) +https://github.com/wesm/feather + +This product includes software from the DyND project (BSD 2-clause) +https://github.com/libdynd + +This product includes software from the LLVM project + * distributed under the University of Illinois Open Source + +This product includes software from the google-lint project + * Copyright (c) 2009 Google Inc. All rights reserved. + +This product includes software from the mman-win32 project + * Copyright https://code.google.com/p/mman-win32/ + * Licensed under the MIT License; + +This product includes software from the LevelDB project + * Copyright (c) 2011 The LevelDB Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * Moved from Kudu http://github.com/cloudera/kudu + +This product includes software from the CMake project + * Copyright 2001-2009 Kitware, Inc. + * Copyright 2012-2014 Continuum Analytics, Inc. + * All rights reserved. + +This product includes software from https://github.com/matthew-brett/multibuild (BSD 2-clause) + * Copyright (c) 2013-2016, Matt Terry and Matthew Brett; all rights reserved. + +This product includes software from the Ibis project (Apache 2.0) + * Copyright (c) 2015 Cloudera, Inc. 
+ * https://github.com/cloudera/ibis + +This product includes software from Dremio (Apache 2.0) + * Copyright (C) 2017-2018 Dremio Corporation + * https://github.com/dremio/dremio-oss + +This product includes software from Google Guava (Apache 2.0) + * Copyright (C) 2007 The Guava Authors + * https://github.com/google/guava + +This product include software from CMake (BSD 3-Clause) + * CMake - Cross Platform Makefile Generator + * Copyright 2000-2019 Kitware, Inc. and Contributors + +The web site includes files generated by Jekyll. + +-------------------------------------------------------------------------------- + +This product includes code from Apache Kudu, which includes the following in +its NOTICE file: + + Apache Kudu + Copyright 2016 The Apache Software Foundation + + This product includes software developed at + The Apache Software Foundation (http://www.apache.org/). + + Portions of this software were developed at + Cloudera, Inc (http://www.cloudera.com/). + +-------------------------------------------------------------------------------- + +This product includes code from Apache ORC, which includes the following in +its NOTICE file: + + Apache ORC + Copyright 2013-2019 The Apache Software Foundation + + This product includes software developed by The Apache Software + Foundation (http://www.apache.org/). + + This product includes software developed by Hewlett-Packard: + (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/RECORD b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..ca76ee2b9c31b4cc1813d5eee6bc3c19b8918429 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/RECORD @@ -0,0 +1,869 @@ +pyarrow-18.0.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +pyarrow-18.0.0.dist-info/LICENSE.txt,sha256=Ip2-KeThNE6VFy9vOkJ37A2lx4UMsDiXxH86JLevzgg,110423 +pyarrow-18.0.0.dist-info/METADATA,sha256=3Nfu3fXswkP1YwG7Zr3dEi01IQJpkqTzHRbwDJiHAto,3292 +pyarrow-18.0.0.dist-info/NOTICE.txt,sha256=ti6iQmQtOhjx4psMH-CCQVppQ_4VjuIrSM_zdi81QAk,3032 +pyarrow-18.0.0.dist-info/RECORD,, +pyarrow-18.0.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pyarrow-18.0.0.dist-info/WHEEL,sha256=GAEhkxm77H2WVEyE70oaJna7cr1z-ZyZC29APwfwcpg,113 +pyarrow-18.0.0.dist-info/top_level.txt,sha256=Zuk_c1WeinXdMz20fXlEtGC67zfKOWuwU8adpEEU_nI,18 +pyarrow/__init__.pxd,sha256=Wnar1phFqM_ZHnZmtbuqm6wJHsXlBoYKhV7Qmo2jUHA,2195 +pyarrow/__init__.py,sha256=KdUkzmN0NJf1AKTskHDoH3OwZoc0Ih2PRu3BqtdjHbY,18119 +pyarrow/__pycache__/__init__.cpython-312.pyc,, +pyarrow/__pycache__/_compute_docstrings.cpython-312.pyc,, +pyarrow/__pycache__/_generated_version.cpython-312.pyc,, +pyarrow/__pycache__/acero.cpython-312.pyc,, +pyarrow/__pycache__/benchmark.cpython-312.pyc,, +pyarrow/__pycache__/cffi.cpython-312.pyc,, +pyarrow/__pycache__/compute.cpython-312.pyc,, +pyarrow/__pycache__/conftest.cpython-312.pyc,, +pyarrow/__pycache__/csv.cpython-312.pyc,, +pyarrow/__pycache__/cuda.cpython-312.pyc,, +pyarrow/__pycache__/dataset.cpython-312.pyc,, +pyarrow/__pycache__/feather.cpython-312.pyc,, +pyarrow/__pycache__/flight.cpython-312.pyc,, +pyarrow/__pycache__/fs.cpython-312.pyc,, +pyarrow/__pycache__/ipc.cpython-312.pyc,, +pyarrow/__pycache__/json.cpython-312.pyc,, +pyarrow/__pycache__/jvm.cpython-312.pyc,, +pyarrow/__pycache__/orc.cpython-312.pyc,, 
+pyarrow/__pycache__/pandas_compat.cpython-312.pyc,, +pyarrow/__pycache__/substrait.cpython-312.pyc,, +pyarrow/__pycache__/types.cpython-312.pyc,, +pyarrow/__pycache__/util.cpython-312.pyc,, +pyarrow/_acero.cpython-312-x86_64-linux-gnu.so,sha256=CzO1lbcGNSipSDtXVTDpF1uh_dwJNksr0uHgCMablD4,325656 +pyarrow/_acero.pxd,sha256=5ish_GgGWvit4ebhzoZil7b-m0r2RuG5JwYoxsH34FI,1440 +pyarrow/_acero.pyx,sha256=56orFsG2ksoP4C0DPIa-ruQxQRCC489lYlHkGJIh1zY,21301 +pyarrow/_azurefs.cpython-312-x86_64-linux-gnu.so,sha256=j0mX0ZVWYrLyG2z1XiFiSKJHZEUjMk7F_sYBzeiC46o,100736 +pyarrow/_azurefs.pyx,sha256=ezGU_z3kIw2r14YAB4yogaxMssJXikcxajz4dZSez1k,5909 +pyarrow/_compute.cpython-312-x86_64-linux-gnu.so,sha256=-75kXqawlzHSB8wIRB-3wqe0N22QnQHlmr9zb8r2_A0,1367552 +pyarrow/_compute.pxd,sha256=nmjgwV2KCGFfxZj5ruDwM4oH1ITqF0rDQS0yDvcaXBA,1949 +pyarrow/_compute.pyx,sha256=dmGXzWOdzMfrj-nWV8RWBRypswmtKZSEwRZ64KZsuuY,107233 +pyarrow/_compute_docstrings.py,sha256=7Vg8jt1aCsWrpTxsdqR7gY6M0faxXNX31c1RZdq9CFw,1707 +pyarrow/_csv.cpython-312-x86_64-linux-gnu.so,sha256=kvIlEcQZF8mZC8bFbLB79BEQo3EA64DSa3Rmhtot9EA,370136 +pyarrow/_csv.pxd,sha256=1Zk3Zpvvhy-Tb7c79Aqd4e7bBM21kc1JxWJkl02Y4DE,1638 +pyarrow/_csv.pyx,sha256=aqWLHgfZ-nrqKHfVVVXeGbtY2ZChE6Oi9qIa5jP4m1M,54705 +pyarrow/_cuda.pxd,sha256=VzhM6j9dpNgrABlvFJKoMpRC0As55im-M3tgPTOuwEk,1922 +pyarrow/_cuda.pyx,sha256=YSlswn4Tj1H24SL_iPIGqT3v3JmofE7NwCunuOLMNwY,35136 +pyarrow/_dataset.cpython-312-x86_64-linux-gnu.so,sha256=TINV01BLRr-V8CU0rIxhvsXmxJYIb22OmFtW9TsFeyQ,1069560 +pyarrow/_dataset.pxd,sha256=Ag9rUhoBySU6ba3wFLeuZyWMJnz9VkAf9TQEzWG4hUU,4944 +pyarrow/_dataset.pyx,sha256=WGqJ4LCtbpxkU4bdIjLR0MAxE1WgmrsCvSQlr1M-emc,157400 +pyarrow/_dataset_orc.cpython-312-x86_64-linux-gnu.so,sha256=Ncpe1WqEvmFMLpdn9_M7BQizHOySb9gnO_YjDLNQBB4,78528 +pyarrow/_dataset_orc.pyx,sha256=JSFoRI0pfHtL2jeIuPg5TJHodcfuCNYmj_iEZ4xY87w,1499 +pyarrow/_dataset_parquet.cpython-312-x86_64-linux-gnu.so,sha256=rdGHm9V-aqQbylDCfS_XcGqTIVk9FIUm5PRtAsrr5PM,363208 +pyarrow/_dataset_parquet.pxd,sha256=y-3iKehyB_eB_oeqjtt4aQRbUpVGVN1oUMFGIY13brE,1572 +pyarrow/_dataset_parquet.pyx,sha256=x_VvTpuF9vxfAAUGkLZrr0N_TCxYWlqMahlkg0201gQ,38916 +pyarrow/_dataset_parquet_encryption.cpython-312-x86_64-linux-gnu.so,sha256=nU3etFWPa7m5iWcDiDB-nTLYlAHPVvmHfktNUZ3dsTc,120792 +pyarrow/_dataset_parquet_encryption.pyx,sha256=p7LDNUsp3jMVcWDcbOFp8a3CYJjASVPI_tfATpY-ePg,7229 +pyarrow/_dlpack.pxi,sha256=clw0FkGoyZMEtUU8zPpO_DMtl2X-27kb2UtyhQuIc1s,1832 +pyarrow/_feather.cpython-312-x86_64-linux-gnu.so,sha256=2xQDtVCtdyT3LQlx0Z9H3H_5D-RQzIshL4UzkurgHWg,114048 +pyarrow/_feather.pyx,sha256=DWQI4U0uAWE1ZYUwPreBPJg1TGLEGmF3wPEIRL-PhPw,3773 +pyarrow/_flight.cpython-312-x86_64-linux-gnu.so,sha256=A7HTbAn9DizK0fnyqsxFnTnCfB1eZycu_q-DW4rYuGg,1295936 +pyarrow/_flight.pyx,sha256=siQR9YhOPLToP5dnHtkm-aCjgPfBLYq8d777RHY_MsY,110592 +pyarrow/_fs.cpython-312-x86_64-linux-gnu.so,sha256=hueNcOY_-DCLIUHmNjDm19pT9akEm1fPGO_drOdg9UM,494360 +pyarrow/_fs.pxd,sha256=SmHS31eyYU7VUZlVuP613HKgpd7bENnQGApvX_g2Lfw,2439 +pyarrow/_fs.pyx,sha256=ZyfvmWary8XTHaCoeSYtjI-b0SK0lCsznerI8cGh4K8,52479 +pyarrow/_gcsfs.cpython-312-x86_64-linux-gnu.so,sha256=oddiMszazY06J7lU5UzkrXwqQ3XEtkCMwqLp8qcf-Rs,125624 +pyarrow/_gcsfs.pyx,sha256=fa1QmTQII9sFKtjtfeZPQfTEntAh3IGyJJ1w116OCA4,9121 +pyarrow/_generated_version.py,sha256=i_fCpTx5i1I-UJ-nlF2AibzMnw39cYz34cp6LRgZ6Lw,413 +pyarrow/_hdfs.cpython-312-x86_64-linux-gnu.so,sha256=g1QSx1UkHEy2cQPAWp6lPBBpkU867IzM_iQbK5FIvt8,126368 +pyarrow/_hdfs.pyx,sha256=HA0KkZa6aVRmwg3ku3U7lZo_8COn1cLwylfc6nEJUlg,5885 
+pyarrow/_json.cpython-312-x86_64-linux-gnu.so,sha256=2SlAopzzBqY_QokjiQhY8-lOfi09hEuvh9iTAZhTnv4,112216 +pyarrow/_json.pxd,sha256=tECTP14M12-b_ja5QI3snQbd0uWPWmmC9FwkWq23Vg0,1206 +pyarrow/_json.pyx,sha256=RmaWaSTG61u2Qcmc_fLsTns_awLJDls3_SdlaCAg53Y,9860 +pyarrow/_orc.cpython-312-x86_64-linux-gnu.so,sha256=ewqPpAK5I12Kyr8lXgudODy2wEJmZXDtQ5pY6SbnREI,206480 +pyarrow/_orc.pxd,sha256=6hL0cq1RufqQD-B_bV3ne1rhu2g-h4rDOFNQsSb6qps,5689 +pyarrow/_orc.pyx,sha256=Pn7r4dzagWaqMf8rymbXBIWisxonBaStZgXCi7pfrZI,15556 +pyarrow/_parquet.cpython-312-x86_64-linux-gnu.so,sha256=2QyN46ON68iLUE6E9jnxlHQ5IL_aMymi15e1Pg1T0-o,608296 +pyarrow/_parquet.pxd,sha256=bLdSgSZg1bn-ZsQbAKOWH44esooFHQOiO9wqmrQuvn4,26805 +pyarrow/_parquet.pyx,sha256=xpIBXt3xv9S1H9G6CqSpwl_ZokhJpy09yShxuntqMMg,74388 +pyarrow/_parquet_encryption.cpython-312-x86_64-linux-gnu.so,sha256=QxlbaWRzFgFy_Y534VMGc7R4BGnCn6X8LPd25C3FZAg,275008 +pyarrow/_parquet_encryption.pxd,sha256=1vQnkyS1rLrSNMlmuW62PxkOmCsYpzC60L9mqD0_LYc,2586 +pyarrow/_parquet_encryption.pyx,sha256=CaTiq5EjTVGYQnxDEmpYcItSBiEencV-pNEu-lBAiOk,18625 +pyarrow/_pyarrow_cpp_tests.cpython-312-x86_64-linux-gnu.so,sha256=Qe9kRjHecoR0c7V5AAHE-ror_WA9uj2mObVD8UTXq7s,87984 +pyarrow/_pyarrow_cpp_tests.pxd,sha256=nPyRmNtFbOUvSXCwegAApQFfh8UI_K9Hq5dN4oPAxdo,1199 +pyarrow/_pyarrow_cpp_tests.pyx,sha256=gLeMzB9RWodZgXEpipX65_0aqWu12SjMld0JZmZVRP0,1753 +pyarrow/_s3fs.cpython-312-x86_64-linux-gnu.so,sha256=OTlYyfsqxBcc8i2u-XUq1VPCLBs783bA7TkcDZytgos,223392 +pyarrow/_s3fs.pyx,sha256=VFuZkBV8rt44JrYtAwbPxGI1YlZJ9gfl1U91JQgJEMU,19706 +pyarrow/_substrait.cpython-312-x86_64-linux-gnu.so,sha256=Kn-zfj46RpmeRweki24CjN84RLBk3C9fVLWtvIbqBKo,187176 +pyarrow/_substrait.pyx,sha256=CA2kxzxJUVPL7lMn8_XSAa9jt1Alq4IbhcI3sHGvsxw,11630 +pyarrow/acero.py,sha256=_P7DcFTmhgW4-EAyM67luFMWcp4t1iUX1pKBIkVe7cM,15108 +pyarrow/array.pxi,sha256=jP6v7Y3YUVggdyKmfNAa7hGNd2T-1xONCTxK-2a-IOI,151046 +pyarrow/benchmark.pxi,sha256=DYXdu-jMSH7XcTohbc8x8NiKRLtpX9IULfY20ohkffA,869 +pyarrow/benchmark.py,sha256=k9Z3yQyoojpYz4lTA6DkCfqT6fPG3N2fJtsHKjpbYFo,856 +pyarrow/builder.pxi,sha256=9QE4KAiA4JpA7-2JLgX3xo32jRtuWZ3YqC-T9GzUVDc,4634 +pyarrow/cffi.py,sha256=hEcrPH9KeG6NES3ZCpSbOVYhOgDOuBB_2LgMMucgw-8,2396 +pyarrow/compat.pxi,sha256=Sq5c3CKq0uj5aDyOoHHkPEO_VsSpZ90JRaL2rAKHk5I,1920 +pyarrow/compute.py,sha256=MyrZk7PTX-8pYlUu5PLbuXjDMTRpyCcgdFWi2BPVK0I,23181 +pyarrow/config.pxi,sha256=E6QOFjdlw3H1a5BOAevYNJJEmmm6FblfaaeyspnWBWw,3092 +pyarrow/conftest.py,sha256=afosSyVsVRsJdDXRXOFQEyj4qVO39OtZYVb_wbvvadU,9811 +pyarrow/csv.py,sha256=S6tm31Bra9HPf9IsYwBLltZBLMvNzypWfeCLySsjmds,974 +pyarrow/cuda.py,sha256=j--8HcBAm5Ib-kbhK4d2M6SVQmDWkr7Mt5fnwU2LzdQ,1087 +pyarrow/dataset.py,sha256=4ibGh9x36jEYI7VMxTdZc-XDg8VfNx6RPbA2L-bLJbA,40232 +pyarrow/device.pxi,sha256=CtVBXp68zNXrrPwehh56igfLsMSlYRo5rWFcKkEP_gY,5569 +pyarrow/error.pxi,sha256=Wj7-NGUfdvlEwAwd8Ta_JqRC8IUOUpm_PmpvizCFvfY,8909 +pyarrow/feather.py,sha256=9rWL-TYK_qc0FW3vIyYyd6Xt86ApJWLqo-2cK3F5vGQ,9959 +pyarrow/flight.py,sha256=HLB04A0SZ35MZJumPIuBu5I2dpetjEc-CGMEdjQeQRQ,2177 +pyarrow/fs.py,sha256=M-cSbS2bBR4MwbJqpz9Q7VxHY8fa89StEw2J0XMMF7E,14899 +pyarrow/gandiva.pyx,sha256=bF23rkq6e45i-CePDZeTy9iFwoeg8ElrNjz9VK97QRs,24503 +pyarrow/include/arrow/acero/accumulation_queue.h,sha256=_HoTuKEkZodmrwXF9CeWGsmpT7jIM0FrrYZSPMTMMr8,5856 +pyarrow/include/arrow/acero/aggregate_node.h,sha256=9HdFxR6tzSfx_UaUHZtS1I2FCbm3PvfF8FdekVpBO34,2155 +pyarrow/include/arrow/acero/api.h,sha256=fRuKEHbKDYWRCwSHLc7vSD-6mQavyOsztluCR7evFCk,1151 
+pyarrow/include/arrow/acero/asof_join_node.h,sha256=Ko6r1wDjxg01FE9-xKkttx7WzCAzf43GxbpvGHgKZp8,1490 +pyarrow/include/arrow/acero/backpressure_handler.h,sha256=CsSWRenrtbZYiNnf-cdYCgMLmu5KUAPUKNKMDWttoD4,2810 +pyarrow/include/arrow/acero/benchmark_util.h,sha256=T5bNabF1TDAp28S7V_vt_VIDn6l5Be0zOVCHhcTcFf8,1943 +pyarrow/include/arrow/acero/bloom_filter.h,sha256=bFzzAzQrs9ePp2tCPQIuk1Oa9gG_Nyp72M_HM0dhakM,11978 +pyarrow/include/arrow/acero/exec_plan.h,sha256=U0KA3tnNvVb75G0XQFLVbGzXCGdddGyRhW3zMa8oWJc,35909 +pyarrow/include/arrow/acero/hash_join.h,sha256=Ji0k5z778QtNQ0MwU6xBP6z7ajLk79Va-vgCqrlApso,3003 +pyarrow/include/arrow/acero/hash_join_dict.h,sha256=_BKJmK3Z_KdJuYHh4KQCuT_1rXlUohrtEgGLtEJ4fgQ,15360 +pyarrow/include/arrow/acero/hash_join_node.h,sha256=FXT-aeXL7nNTuV75f9oXgdGyqMK_72GnqGUm9cmBnko,4378 +pyarrow/include/arrow/acero/map_node.h,sha256=Bd1HcW0N5azoIVth2ATeHxgTKd9XmmEkz42YBNw5eK0,2628 +pyarrow/include/arrow/acero/options.h,sha256=r-GnLElNJAAdFoJ7k0Q1TOfvGSOdgT9BrWbdMcS_SF0,37262 +pyarrow/include/arrow/acero/order_by_impl.h,sha256=dQqplP-AZWPZRKio8LmTjYWlCYz9VmW-usUrtaLpd_w,1691 +pyarrow/include/arrow/acero/partition_util.h,sha256=bs_zxok-qng8jsHmVBlfJ7Ts2uBEmovEb27knqQmT-Q,7411 +pyarrow/include/arrow/acero/pch.h,sha256=8VXXI10rUHzlQiAthx-yjHMQCpGL3dgAiVaGzTubPPE,1094 +pyarrow/include/arrow/acero/query_context.h,sha256=D364aGRS3uWe8lgYqCNRjVvs5sKetLOOXzACdp5GZeg,6212 +pyarrow/include/arrow/acero/schema_util.h,sha256=KA_hV2xy2TRccMyksSzQrdH9_rdGo3tQyHOIvrWWYBQ,7961 +pyarrow/include/arrow/acero/task_util.h,sha256=6pqILuYfcVwt9HqVhRfXFVJoOC-Q_dtk8mQ5SxjgwbY,3706 +pyarrow/include/arrow/acero/test_nodes.h,sha256=xKeLWZZC8iokveVXPjseO1MOvWMcby-0xiMISy0qw8E,2877 +pyarrow/include/arrow/acero/time_series_util.h,sha256=W9yzoaTGkB2jtYm8w2CYknSw1EjMbsdTfmEuuL2zMtk,1210 +pyarrow/include/arrow/acero/tpch_node.h,sha256=l3zocxHTfGmXTjywJxwoXCIk9tjzURgWdYKSgSk8DAQ,2671 +pyarrow/include/arrow/acero/type_fwd.h,sha256=4zLhtLJf_7MSXgrhQIZVGeLxjT7JrEDAn9yW75DTFlc,1103 +pyarrow/include/arrow/acero/util.h,sha256=byhMEj5XoAUy-93AjLrx_p9_iUZdYn5uJ_cDkCJQt5Q,6121 +pyarrow/include/arrow/acero/visibility.h,sha256=E-4G2O4F2YabXnFNJYnsI2VbVoKBtO7AXqh_SPuJi6k,1616 +pyarrow/include/arrow/adapters/orc/adapter.h,sha256=G5SSGGYMSREILC43kqL5fqo94c4tKgukitO15m217tY,11031 +pyarrow/include/arrow/adapters/orc/options.h,sha256=FMxda5YSskRrB6h9FvcAuMxl5qdavWrNYHPlanjtk48,3696 +pyarrow/include/arrow/adapters/tensorflow/convert.h,sha256=ZGFAodnwTJK0ZoXfgYJdjgi_F4vfEhI9E87zejxVb6E,3465 +pyarrow/include/arrow/api.h,sha256=Gs6HiRBYU5N7-a79hjTl9WMSda551XdUKpWthFY2v1s,2491 +pyarrow/include/arrow/array.h,sha256=P5oW6hvD2j97bLaSTE4_UHuV6Y38DTwJVww3Eb3xdTQ,1981 +pyarrow/include/arrow/array/array_base.h,sha256=14RULo7wEJze9IY2psySGtBlBsnCErnqY4lBO4ckU6g,12123 +pyarrow/include/arrow/array/array_binary.h,sha256=JvtB8DoR0_tqfSFS_9nMRrJ39lt1cTm5yXh-DLkhqjU,11247 +pyarrow/include/arrow/array/array_decimal.h,sha256=xRfrZ1IFO09EmkHEolCwrJ4lsXjLo5DXdfH5_v2gSyw,3105 +pyarrow/include/arrow/array/array_dict.h,sha256=6AMbSnZoMj-nhQhZhG4RNnxy9VVPk2DvZjVblwIUhgY,7611 +pyarrow/include/arrow/array/array_nested.h,sha256=xySiF5b1ab97GifKMx6FuYZWb2_6e3YvSMfOORGe3J4,37605 +pyarrow/include/arrow/array/array_primitive.h,sha256=anek7WkjubNBTRz8wOHyZ0_UuE3BExj02P-PCs3F5To,7719 +pyarrow/include/arrow/array/array_run_end.h,sha256=4zs3tcUrIgDOhSEOywJ1vGY2lsH-5QuEBn87mxnDbi8,5101 +pyarrow/include/arrow/array/builder_adaptive.h,sha256=92DpiIZDXSI_yOrMftj7P60zlCLjNmwfGM5ubdbXWM4,6861 
+pyarrow/include/arrow/array/builder_base.h,sha256=CP9kS8pDFd4XyJQdgIlBp3pTIX9mND1Lvh85re4IC8w,13723 +pyarrow/include/arrow/array/builder_binary.h,sha256=01BrSwkFQNAEy4FVYi8Esbd2CaeyxN04GDUoXsQUFhU,32718 +pyarrow/include/arrow/array/builder_decimal.h,sha256=DFxyFlpzWRZS9zdBhsjII5fFUOMY9bXHn3EIrIvmOMo,5051 +pyarrow/include/arrow/array/builder_dict.h,sha256=FZjvCRIDmVuwmzx_HCcDK6ZjNoZKCEsSV-fGI0K974Y,27899 +pyarrow/include/arrow/array/builder_nested.h,sha256=1In_M8pjkaqTuvNZlbGXWGID33CW2eBhy4in0oZsavA,31231 +pyarrow/include/arrow/array/builder_primitive.h,sha256=OOfGI-zDM7BMWIBv-Tko_8pJDkpw-ttQM76JldlUOvc,20808 +pyarrow/include/arrow/array/builder_run_end.h,sha256=SZIdsUKK1qAc9pdonPGf0A_aikZHcxxzicezRGR5hLs,11416 +pyarrow/include/arrow/array/builder_time.h,sha256=8M2ifZnDgujSItXKsevyBdtM6Iky3ImyeIdAqZV3fec,2548 +pyarrow/include/arrow/array/builder_union.h,sha256=8BF532sAMc7JxWIbSN-yX6Z9fqY9jmmsIa054DPvbWE,10144 +pyarrow/include/arrow/array/concatenate.h,sha256=wBy-CBTz9MeRCmcnfXGvkXnvSRApvPOcfCf64A42ys8,2059 +pyarrow/include/arrow/array/data.h,sha256=BuYmkq11BUas2FvufTRZkg_aoWVd-rLX1sBQIwB5HuE,25147 +pyarrow/include/arrow/array/diff.h,sha256=bYNKy2oLAxtt6VYDWvCfq2bnJTVNjG5KMTsGl-gT_kM,3344 +pyarrow/include/arrow/array/statistics.h,sha256=JYPb5hAHmJTQ9cDHcEhhHGRBZurt6CcVbUOlp54UWSU,2498 +pyarrow/include/arrow/array/util.h,sha256=qVHvCaVlALz8WJwAjyMwsBm5J2iN89CSgj7NpmmqlkI,3652 +pyarrow/include/arrow/array/validate.h,sha256=JdDb3XJg4TmAfpv_zgu2ITfL2H9no10TQit-HPj9Myw,1710 +pyarrow/include/arrow/buffer.h,sha256=EfXDyFegRdva4rv4nf0jtErnIrt9_FWoXSHk6OPk_G8,23092 +pyarrow/include/arrow/buffer_builder.h,sha256=tXWILwHW0MKpve7NIU2ElElPY0y0ooISa82Dq6UdhVU,17371 +pyarrow/include/arrow/builder.h,sha256=mBxMko271lJ7Xbku0hCixj943Yx-d2i4Q5Hm2WfwiGM,1546 +pyarrow/include/arrow/c/abi.h,sha256=ZohWkqHoTBeIIGYs2iv2VLL8I4G5lP8MAWgbtpWKLVM,7917 +pyarrow/include/arrow/c/bridge.h,sha256=D9W-vKI_Ko6_INcMAdUx15foV08UbBvL48R8RRcL5cM,18132 +pyarrow/include/arrow/c/dlpack.h,sha256=_HIa9AKR2mwbhf1aChIpMF_XDpFrPaf58Lt3fVxWRWc,1817 +pyarrow/include/arrow/c/dlpack_abi.h,sha256=mjp9WWq8qv6gkGirT4y0o3BL_ZI9VyHQpJ5aEpPFetI,9920 +pyarrow/include/arrow/c/helpers.h,sha256=f0Q519PwoliFHpxsHp-QvbP6fpVMN2Ha35Tk-RBK6Ws,6279 +pyarrow/include/arrow/chunk_resolver.h,sha256=b9JpynFnTfkT4zEPBhBMMcMtQmqgGyNEOLrV7qgOv-c,12368 +pyarrow/include/arrow/chunked_array.h,sha256=rKM1xrqlrGKqQ-5_cxSsENaPRO0Ct3GjbeNq4bbRKx4,10657 +pyarrow/include/arrow/compare.h,sha256=U5craXnXACCUzQ8HmGYyhTehNrOezcVUP1ABAlxI62E,5555 +pyarrow/include/arrow/compute/api.h,sha256=IQKXz_6YBBfHKOkuqkXIh9ZTZYyVgq7aEBTIzMkZEiI,2071 +pyarrow/include/arrow/compute/api_aggregate.h,sha256=cgXomjDDHoAK_ddzyH1NSqWAewzEYPD7qJBj4x5Rkhk,17173 +pyarrow/include/arrow/compute/api_scalar.h,sha256=xtRsJg11WgE5RXV9gZZHfhlEElLEpWUUWnbZXTKw4j8,66540 +pyarrow/include/arrow/compute/api_vector.h,sha256=6jxDvg_Zz14_63SfVlWnfUff135kls1aGGK_d9h3bj8,29122 +pyarrow/include/arrow/compute/cast.h,sha256=Xw9j03AIAMU_hZiqk9d2ZD4xTmESkfXaDsuZkiTypLs,4245 +pyarrow/include/arrow/compute/exec.h,sha256=0ZAA9_tzcQEr364sjJ3SwgTtURTwtCjRLzo_LOdn960,17969 +pyarrow/include/arrow/compute/expression.h,sha256=llX_81uUIyJ8vPmP8-2mAippyw4cVNhCGfqHRY37FOM,11184 +pyarrow/include/arrow/compute/function.h,sha256=krTXaLowvT1cKhecs70urPQcx74vQCJ4jswtBE4Xs5A,16345 +pyarrow/include/arrow/compute/function_options.h,sha256=Q9rjkXPrU9-Xi64_fMLPbBbW_byhjJFsvHppP1CumdA,3088 +pyarrow/include/arrow/compute/kernel.h,sha256=ywsxF87w2eI4li8be7Wiua5bXp0NYhMb7LS8IzPFO3U,31406 
+pyarrow/include/arrow/compute/ordering.h,sha256=8Vw3VzDi1mGgVwKGQZakz9TVj0A40wxcL13EvuqNVjU,4129 +pyarrow/include/arrow/compute/registry.h,sha256=x7LHiaNEVvZ0VUssZFsasB52Z1AxRflkdI5tR1hhzqc,4837 +pyarrow/include/arrow/compute/row/grouper.h,sha256=m-XUADUbpC2wSYmea8rFMbooh0gJQtdTBoF81ywhhjY,7319 +pyarrow/include/arrow/compute/type_fwd.h,sha256=-O63QUbsxWws8TBi55x6u9FweUSSOOfizhE4pTczLd4,1537 +pyarrow/include/arrow/compute/util.h,sha256=eF_BX2aftTa3qUJwaZA3QGTajrDv4nf6HKXs6dOmjug,8863 +pyarrow/include/arrow/config.h,sha256=8liyKI0CJO0G-Fz5I--QjIAwh0m4hosfyAOwvVVs0sU,3044 +pyarrow/include/arrow/csv/api.h,sha256=LbwWhPyIsi_73hvsSr77RNR9uUxrVyXM__hp7QcSom0,907 +pyarrow/include/arrow/csv/chunker.h,sha256=nTs8hdy4D3Nz3oZWm2JMuA02noY_0pWRYWq_RptqzHY,1171 +pyarrow/include/arrow/csv/column_builder.h,sha256=7oa9YCg2Uc2mB7ExHIyYIvbdt555qLXiU0y4FepkISU,2890 +pyarrow/include/arrow/csv/column_decoder.h,sha256=10idcPJE2V_TbvgjzPqmFy1dd_qSGWvu9eDkenTuCz0,2358 +pyarrow/include/arrow/csv/converter.h,sha256=cjtnz_hZFxm_dWjAMjr1iqqk1egXI2Yb8Bd0xC8md5E,2789 +pyarrow/include/arrow/csv/invalid_row.h,sha256=gTHjEbjkpee6syLGA8hFY7spx1ROMJmtMcwhXv21x5Q,1889 +pyarrow/include/arrow/csv/options.h,sha256=_HkjSoiAPW77z5AHVVnTa452y1KfJgnXWXz2NoPPAYw,7980 +pyarrow/include/arrow/csv/parser.h,sha256=8PplRh3Qxckk8VPyM70P_f1MBb4WMGnNVpoeJ9kOdHU,8616 +pyarrow/include/arrow/csv/reader.h,sha256=416pt3yNQsgn4RhIyRMsmSJmvv1sw3ouQotubXG91gQ,4606 +pyarrow/include/arrow/csv/test_common.h,sha256=uEYzw8EROvd1QMBQ98d4MaZ7BqMlw2e0flAyz-du0Z4,1972 +pyarrow/include/arrow/csv/type_fwd.h,sha256=ptVbengmY_a7Yz1w0SKmKL16yyw9yEeym0Q0cnRCSV4,984 +pyarrow/include/arrow/csv/writer.h,sha256=Y1zErZ5H1r2QzjAta3TXpFrdl2btoardCF8USCAGtGg,3549 +pyarrow/include/arrow/dataset/api.h,sha256=p7i-bncJLhmfBkfjJWS7684vD9Lke1m6tb7HQq7Tpn4,1322 +pyarrow/include/arrow/dataset/dataset.h,sha256=sDkJg42vSE05FwRmYi9pes3jD9932X3J8cyYZ3SY2jI,19830 +pyarrow/include/arrow/dataset/dataset_writer.h,sha256=TQV75b_UigfGjIpBnPk8teOncM5WroKfKV15oicBRRY,4589 +pyarrow/include/arrow/dataset/discovery.h,sha256=x7-5NBAyEeQWGlWanJDLZAoWksKiMwM96tlDx_M6n5c,11236 +pyarrow/include/arrow/dataset/file_base.h,sha256=2oe5v8Qy6v_UthJavg9rjU_WuQvwXcJengWwc3sWLqk,20203 +pyarrow/include/arrow/dataset/file_csv.h,sha256=7PlvQW_2FJ5RRN-VH4-OBw5cZ6nkd0KE0sj1TQvCZeo,5016 +pyarrow/include/arrow/dataset/file_ipc.h,sha256=6-btvXhflZsAH90T3wMkwzZkte6T4ixzeCEUn_5uYW8,4083 +pyarrow/include/arrow/dataset/file_json.h,sha256=sPjOeMOtbZZbvOivnOdb4MvYKHltpTnY8fONkhB9PZs,3523 +pyarrow/include/arrow/dataset/file_orc.h,sha256=P7nAD9nacVngDEjH8ChQRt0AQmDg4Z1wBx360LDOoSg,2452 +pyarrow/include/arrow/dataset/file_parquet.h,sha256=bzArl0XrmtTNvWhs6YTkLFxtD8TLbTIJwYmWz3YRm38,16708 +pyarrow/include/arrow/dataset/parquet_encryption_config.h,sha256=Upo0k5MijZaMaRZjPp5Xg8TRt1p8Zwh2c2tdimjVe1A,3425 +pyarrow/include/arrow/dataset/partition.h,sha256=3wrNekD_-fPO1YW91Za-T4muCfQeAX7SZRIcsCN_czI,16815 +pyarrow/include/arrow/dataset/pch.h,sha256=iAE_PbVtKHfhygz7Ox9Z2nlhsIrfageGixGKjlzNRvg,1194 +pyarrow/include/arrow/dataset/plan.h,sha256=IjuR9K2sWD85_2HpVVoJ-3YUCq--UPblHU46exX5qRg,1181 +pyarrow/include/arrow/dataset/projector.h,sha256=KfZijq09Ht0Z2cJHsrjg-sE3SiZ4TKainflReK-39cg,1135 +pyarrow/include/arrow/dataset/scanner.h,sha256=9Ats-ejc6exp3alGUhq0Sw8fww3kJj4ssi8FOKK7SDk,24598 +pyarrow/include/arrow/dataset/type_fwd.h,sha256=YOUSRwdNAlXJ7meFLolpAFQ_mSlObs2F81zcOy0DoI4,3170 +pyarrow/include/arrow/dataset/visibility.h,sha256=ckmf_sEI0WBo4W7DIgH1QrOq82skOHtoksl9B3yYvzU,1586 
+pyarrow/include/arrow/datum.h,sha256=XYaZ_URrAtVqHMq-_2YtXk_ETeQ4yZWLVAnsi-k2Mac,11511 +pyarrow/include/arrow/device.h,sha256=mLz99tb74VdjxXtKt6RZCYKJQ8TYz93uaCFJ1ZiItMw,15344 +pyarrow/include/arrow/device_allocation_type_set.h,sha256=ynoZ-XyFlOAjh01PU-R11mE_EOxuw3xzc94v5OXa0u4,3306 +pyarrow/include/arrow/engine/api.h,sha256=ORM0M5KQeurjEG8Eoa5IeV_ZgKBRPlWyicyv3ORWkAY,886 +pyarrow/include/arrow/engine/pch.h,sha256=8VXXI10rUHzlQiAthx-yjHMQCpGL3dgAiVaGzTubPPE,1094 +pyarrow/include/arrow/engine/substrait/api.h,sha256=W9NB1RAm0ZVxztRXYA-GD7H8XLQNXFoYT7TdGFHoNTE,1079 +pyarrow/include/arrow/engine/substrait/extension_set.h,sha256=FE6cceycuQv7CCe_Fl4t6tIMRyfoJfWClUhSvHgcm90,21552 +pyarrow/include/arrow/engine/substrait/extension_types.h,sha256=x5ZIuynNh6WFt3wRjW--zUsuC3SeDLk1qRg9_xhswWM,3075 +pyarrow/include/arrow/engine/substrait/options.h,sha256=dtvUty_zoDmcFwVflppiDzelYkeOhCO74uRF6izQSzk,5820 +pyarrow/include/arrow/engine/substrait/relation.h,sha256=V3VKFlDdE61e1OS8LbJiwvm5w0uq5bzBLhKqmgmKaws,2385 +pyarrow/include/arrow/engine/substrait/serde.h,sha256=mjxfuFo4aPhCiwefpKAJMIlknF4UOHSr6gWU__1SwCc,16528 +pyarrow/include/arrow/engine/substrait/test_plan_builder.h,sha256=REFa79D1AOIIjp2Iez73iw5gEnzG9Rac9t8WwiGLsuI,3003 +pyarrow/include/arrow/engine/substrait/test_util.h,sha256=IHZeYrk50Sx9anJfC25DWP6XesItKEywDWUqvUJcjEQ,1517 +pyarrow/include/arrow/engine/substrait/type_fwd.h,sha256=P9YRjAQpSgoIjDC0siYyxoQzcPVo3r9y85qjiMtudBs,1028 +pyarrow/include/arrow/engine/substrait/util.h,sha256=_dRiQBaIMWNbsYG7kuXhs3dMk4dI63-pM0uSxYPOvgE,3570 +pyarrow/include/arrow/engine/substrait/visibility.h,sha256=GRzH6U-UCPT8d60cywOkFfcanPSgiZKCDP6X2rIpbMs,1740 +pyarrow/include/arrow/extension/bool8.h,sha256=VsHTtVyrqk6UKgvifad7LouuieoAZuZs_uVvegdGq4Q,2145 +pyarrow/include/arrow/extension/fixed_shape_tensor.h,sha256=VOqvTSnwDIvnhbstYX5nnqWfhtZ7MaD-lSF89BEqlhE,5610 +pyarrow/include/arrow/extension/json.h,sha256=gnJSzCVni_oJKxKMoSNBwsuBg1BJzk_goGIE_uTSMJY,2109 +pyarrow/include/arrow/extension/opaque.h,sha256=uMVqSScey_13Ho6V86vfkuoByZni9ufh5BGKgX4bTZk,2920 +pyarrow/include/arrow/extension/uuid.h,sha256=E_Bnp5KNKSxVuvdhQHjYT-0HKa9mzVPbSAQjuZ9N3Pc,2278 +pyarrow/include/arrow/extension_type.h,sha256=5rDE_IuEMAQg05k6wnbo6pu8hOW3-jp9Ab89souwcds,6628 +pyarrow/include/arrow/filesystem/api.h,sha256=Xgy2GOZtBVwDjTaXPDyPPlS9Bwt9gjWXm5I_QbyRbFo,1383 +pyarrow/include/arrow/filesystem/azurefs.h,sha256=urXoeGp29R42-0ILfkKBhzSa3U2DjjVaFmol2kOsb3g,15223 +pyarrow/include/arrow/filesystem/filesystem.h,sha256=H7MEX1259aVrWMsgsWX26tuCEPSJF-iI51J3sKsYec0,29585 +pyarrow/include/arrow/filesystem/filesystem_library.h,sha256=axaof-G9GxBjzXhRIt4azB7HB8VJ49MtGYsL7pSO0A0,1725 +pyarrow/include/arrow/filesystem/gcsfs.h,sha256=wzVfIkqhUp-aw6NFNhMbvl0bczty3HmdiYG36oPCDS8,10533 +pyarrow/include/arrow/filesystem/hdfs.h,sha256=Jn91pjfk6RMx-MuAWsEAKLTyKQ7bDPNA5jMEVzafSgc,4133 +pyarrow/include/arrow/filesystem/localfs.h,sha256=eIhPrpABheQz21WE845ULleTk83e4EtJnES4jALW6mM,4972 +pyarrow/include/arrow/filesystem/mockfs.h,sha256=kohu7s9s9xtd75sGTE2K_rsHW89swDOtSSSFxBixMcc,4768 +pyarrow/include/arrow/filesystem/path_util.h,sha256=hrDVHk4F9M7oGABB4x2wKfQMjSlSAIS0IaLVv2jHrl4,5698 +pyarrow/include/arrow/filesystem/s3_test_util.h,sha256=ffeqZmR8G8YyzbpUWws2oSEchYPBt254jwOHWdkcWQo,2767 +pyarrow/include/arrow/filesystem/s3fs.h,sha256=0C98nH3MLI-lq0FW3mWufnY8z43GWCl4BVOnhgDsFhw,16217 +pyarrow/include/arrow/filesystem/test_util.h,sha256=MFwd6ljnwR8q1smTSpVRLk_15Ch_v1hEQWkRL3lAo-s,11412 
+pyarrow/include/arrow/filesystem/type_fwd.h,sha256=zztDER55Wbt4rVnkd-ReeDO-YnrpemftFeFtZ7ZGidY,1462 +pyarrow/include/arrow/flight/api.h,sha256=YotLTQn-KCl6y5BIg8coEFZ9n7PMtJ02ly7Pc5gmX7U,1257 +pyarrow/include/arrow/flight/client.h,sha256=NtFquWOaafBcmdIB4en9ua5xSEJaCBkC1ZHhAU_Gg60,17798 +pyarrow/include/arrow/flight/client_auth.h,sha256=a3Dkm_jPOuqzNsDA4eejuMUwCEBMavM8uS7w81ihbRY,2216 +pyarrow/include/arrow/flight/client_cookie_middleware.h,sha256=5zkCP2SxMFQuTX8N9NHxOve5J_ef2rFO6-xY4Tfnygk,1204 +pyarrow/include/arrow/flight/client_middleware.h,sha256=aAZwCahuiBhP85iMPe7xNWvidBR9KeHGto2YAqJioI4,2948 +pyarrow/include/arrow/flight/client_tracing_middleware.h,sha256=d0sTmUOfq5M9FMliIKK-flJkR6-7r69NjU2TpxhfqWo,1217 +pyarrow/include/arrow/flight/middleware.h,sha256=JPQd8JnIVcwjTH6yOBck4BWR-WV95fpnAdhHyEYvfKE,2254 +pyarrow/include/arrow/flight/otel_logging.h,sha256=riS9sZM2C3mH6VMbESizJ6lGmudqdJhfdCY9_cJJqMA,1139 +pyarrow/include/arrow/flight/pch.h,sha256=Dp2nrZ3t_KPjm0cIMyu913BbCorJG5rmbtpfyDN09bo,1192 +pyarrow/include/arrow/flight/platform.h,sha256=1ZfzVaollAZosGyH_1JvzEA8iNR0hi9cUGz5eyLT1zc,1209 +pyarrow/include/arrow/flight/server.h,sha256=GAcV0-THuBuj-bXfwqYrZ1P2bwZgKQSJLbu8ToltRvU,13185 +pyarrow/include/arrow/flight/server_auth.h,sha256=zKQ8lvkMBuMYiIfT1sU0MPXqVPQikaOS3npBgytcaKk,5429 +pyarrow/include/arrow/flight/server_middleware.h,sha256=ITKjCNTT2qnX7JeqWdaweC_QpCX_ytW9PFucQYmPkFo,4317 +pyarrow/include/arrow/flight/server_tracing_middleware.h,sha256=zR0FFZYGwAAqhzVhPVDjyXfZda9zmLteqauwA5dgR_w,2186 +pyarrow/include/arrow/flight/test_auth_handlers.h,sha256=XkvMWucv9GQjlt2ttvYxshym4kUubUdMh-timlQIt1I,3315 +pyarrow/include/arrow/flight/test_definitions.h,sha256=esAWPIVJxTQqGpPTxa4Dm_HdAnzK-4DoJAb3zFtQBiM,13022 +pyarrow/include/arrow/flight/test_flight_server.h,sha256=SbRhZP0U4ILnbg7lYQvGeXmvPM_B6bai12FTM_HD4RQ,3930 +pyarrow/include/arrow/flight/test_util.h,sha256=E0OlDLwcknevKf4LzzqdU3jfxUMV_mcIJxy4U_up77Q,6860 +pyarrow/include/arrow/flight/transport.h,sha256=ZDXc-f8o00TFWESwsGU1My7rR9OfM3X7OZjDcGXTwIA,12181 +pyarrow/include/arrow/flight/transport_server.h,sha256=iVdXmrb2pemh4o6BxwvB7OZAV4UeoWrbhe4ePZ5Pi4s,5268 +pyarrow/include/arrow/flight/type_fwd.h,sha256=tQFAM3QNKPdzB4VqUGdEUFjNPYXVZLApwGnSus2GQx8,1797 +pyarrow/include/arrow/flight/types.h,sha256=b_HQAdmPTh8sZsk5KI7diTMlfm5TmnPFgc8sHE9KFWs,46638 +pyarrow/include/arrow/flight/types_async.h,sha256=3nIQqwCYO4Ir3Mt2bG7BNntXxuNHYQNNpz-Yl3EaFTQ,2599 +pyarrow/include/arrow/flight/visibility.h,sha256=N1k74cwyRvOaYFa_tCjdgUjiSdPBhmy20UuVGu0wTg0,1596 +pyarrow/include/arrow/io/api.h,sha256=Pn4jZSTsLW8MAlMyXUokmJdupX54u154GYI5AvD5ByA,996 +pyarrow/include/arrow/io/buffered.h,sha256=YFKKAHStUFncnfpwnk0XSZAZLeLX-LAXV1qH9VGaE1k,5845 +pyarrow/include/arrow/io/caching.h,sha256=AAjoyKwQ06m2XiglFS6Ch_cdg2p4-wkA7GakGI_eX1E,6708 +pyarrow/include/arrow/io/compressed.h,sha256=3JxIOo1q8VhjIErfwVM5ZLVkwwQKXd-FT5517j58etA,3774 +pyarrow/include/arrow/io/concurrency.h,sha256=SmIr0OWCgMUR3j9ngVbjMJhWOUrU15jQf_jz2rUw7r4,7934 +pyarrow/include/arrow/io/file.h,sha256=-ZEklW1Q0sj3pYCQLQ1ebirKd3s2GI3vUEIszFr8mVU,7625 +pyarrow/include/arrow/io/hdfs.h,sha256=2s3f49ggAYgSCsX5SoqnomwsXd24_IZhW-VSBJclqTg,8559 +pyarrow/include/arrow/io/interfaces.h,sha256=QIBHTJUobEkwcqnKMT_GEKu5ArzpeGmK-8v7z4qGHIQ,13428 +pyarrow/include/arrow/io/memory.h,sha256=htc3MmEbEvwc28bLjCtTtt9QcYp-10WKLmX0V9TnwRM,7048 +pyarrow/include/arrow/io/mman.h,sha256=qoLBAGFcvpYTy96Ga7FNWDJKT3uhxpFAF3hbXIaDSiY,4111 +pyarrow/include/arrow/io/slow.h,sha256=8-ZjQJq49EQJ4esQ6qHHjlKCeZNg4BSND7ire-ZtLYQ,3942 
+pyarrow/include/arrow/io/stdio.h,sha256=dqMTHoJbmiXcyNa2fN60tSWQsx0GPphZVCLdGiZNt8I,2095 +pyarrow/include/arrow/io/test_common.h,sha256=Rj8mwgcUkzksrlBALiAldtr_6JGHJFLh2SztGVkRiSA,2112 +pyarrow/include/arrow/io/transform.h,sha256=W9XWonw69VymQAaQptfW7jD-6ry7VCpfPXlkB7aZzOE,1890 +pyarrow/include/arrow/io/type_fwd.h,sha256=Pi7EFpFvBXsFN1xKOyZjTSP95xNDs6W5hxb5GucoVVE,2315 +pyarrow/include/arrow/ipc/api.h,sha256=olkdu82mTS8hmwD53DBJJL6QQ0YBplhs-s-m4uOInSQ,1007 +pyarrow/include/arrow/ipc/dictionary.h,sha256=UTjZPIG8mLZOk9IW2QnR9RZGr1npexZOp103fv-O70E,6104 +pyarrow/include/arrow/ipc/feather.h,sha256=uCnxwO7eUH18kJ-lWz9IWwSj6AjfejqqLdoifJ-UBDo,4918 +pyarrow/include/arrow/ipc/json_simple.h,sha256=IjFjx6Z7h_WLXt1paVIJboUOTR5GFBhWUhCbm_m9lNk,2455 +pyarrow/include/arrow/ipc/message.h,sha256=KtMCbIC2J4-5iyPG5Sijqu_MALxiuKWBYZhGnw0jxOQ,20011 +pyarrow/include/arrow/ipc/options.h,sha256=X2BbCaQ03S1uqedgLRbvLyfb1PHZ7WGRBjDLLCbQMGE,6888 +pyarrow/include/arrow/ipc/reader.h,sha256=NqdrqqAEItO1ecYUINRO7-qhKlYy-CHSJKGI2hdXlRQ,24106 +pyarrow/include/arrow/ipc/test_common.h,sha256=_kWOR_-YKtilcCIWK6I4WYo8fcRt6eBMfxEM4kDtY20,6351 +pyarrow/include/arrow/ipc/type_fwd.h,sha256=Ty8ET7nLI4JJeTqDMyP0pEH9QVj9xs7BpJkZrnrpaPY,1440 +pyarrow/include/arrow/ipc/util.h,sha256=wTkfC9YFKZlAAjyzlmQVZcW90oOj_JatjDN4qz0IxHg,1414 +pyarrow/include/arrow/ipc/writer.h,sha256=hum8E_orkG_X38vgyfyKhGbyvcLJ3AkXEykyBjAXIYg,18870 +pyarrow/include/arrow/json/api.h,sha256=XRW1fP43zVqwy1yabaKctNK9MDZqnxkoHDH1fx5B3Y4,879 +pyarrow/include/arrow/json/chunked_builder.h,sha256=DDuMwrImMECw6Mhfncn2xMOjkFcKUV1O1597_fSFSAs,2365 +pyarrow/include/arrow/json/chunker.h,sha256=dkZOcxsF1Q3ek58P7IoA8f3lQyBQpFvGSFeynNV2Olc,1119 +pyarrow/include/arrow/json/converter.h,sha256=3lXsP3BSdpLPIkFAJnYW9vP8BbX3neVYR_W0zFKClQ0,3134 +pyarrow/include/arrow/json/object_parser.h,sha256=Y_6Oceya06aUyeo-1k047dm2-JUMJa2_w9iyZ-goIRQ,1627 +pyarrow/include/arrow/json/object_writer.h,sha256=UrIrjCkIz7Q5n_FpV5NNPD96gHHdTkvTJaekuGBHwTo,1428 +pyarrow/include/arrow/json/options.h,sha256=EypQgDwLZQbrPnAh45nSPfpGGYrxvLgfp1eAG_l0p3Q,2227 +pyarrow/include/arrow/json/parser.h,sha256=3oIzO5kUs2Takc7t_d5mH7bp1uIcc1M-qbuHmPoSI34,3383 +pyarrow/include/arrow/json/rapidjson_defs.h,sha256=lBJlfuYWIeQQ8awPd3bk4jJc81efr_KzKwG8Klw7t1s,1474 +pyarrow/include/arrow/json/reader.h,sha256=KNO9dCyc2RZs7WxUSEW7bpCYBh_h1C3U52YHYxBnP0M,5212 +pyarrow/include/arrow/json/test_common.h,sha256=YiiY_jswpp7Nu6IW1Y2lBhqWSFRoNaNEy1jHd5qkYHQ,10874 +pyarrow/include/arrow/json/type_fwd.h,sha256=o9aigB5losknJFFei1k25pDVYZgkC2elmRMX1C6aTjo,942 +pyarrow/include/arrow/memory_pool.h,sha256=SjPtWz1tx6Lotr2WeOKCCIw9NQc50Zjez3yzgfr7SDw,11064 +pyarrow/include/arrow/memory_pool_test.h,sha256=qv7csk6hZiO2ELFF-1yukpppjETDDX0nuBFBbPFHtMU,3350 +pyarrow/include/arrow/pch.h,sha256=MaR9bqy2cFZDbjq8Aekq9Gh1vzLTlWZOSHu-GhWP1g8,1286 +pyarrow/include/arrow/pretty_print.h,sha256=ZDlroPRr9_ryCk7h_rjA8pL7BNgaJQ9HnRb2PZU63lg,5529 +pyarrow/include/arrow/python/api.h,sha256=W76VAxYqOxi9BHJddji1B62CmaWDFuBhqI65YOhUnGQ,1222 +pyarrow/include/arrow/python/arrow_to_pandas.h,sha256=jUBEUMKXw70oJdMlgkSf6HitaNweQcc7hxI75_C9WSI,5561 +pyarrow/include/arrow/python/async.h,sha256=C0f8YYmgwBGgDau4xEFsdjukiZB4YvpylETHEZryHOo,2352 +pyarrow/include/arrow/python/benchmark.h,sha256=f-kzyMOlPKDse2bcLWhyMrDEMZrG_JHAPpDJgGW0bXU,1192 +pyarrow/include/arrow/python/common.h,sha256=yjljfJK1f7slZ7DBQ4LTo_pob70zioswJNWazy0p-uM,14412 +pyarrow/include/arrow/python/csv.h,sha256=QxU3B-Hv_RsoEcMGS9-1434ugouL2ygC64Lq6FgviNM,1397 
+pyarrow/include/arrow/python/datetime.h,sha256=Bny_THGi2tyUeHxcOuw01O7hNE8B_gave5ABAZQtwTQ,7931 +pyarrow/include/arrow/python/decimal.h,sha256=kDDjLzW07D7d7omWSR4CBF1Ocskp4YSZu4Dtxu-gRUg,4726 +pyarrow/include/arrow/python/deserialize.h,sha256=Q4L1qPCra8-Wzl6oLm44cPOUMVuK1FX01LeGzwNUtK4,4260 +pyarrow/include/arrow/python/extension_type.h,sha256=0gzb42y_mbw4fsYs3u8cwPFLBRlG-kkHQLgbvGtrY0U,3181 +pyarrow/include/arrow/python/filesystem.h,sha256=FG0AcLekqaDf9IQPqKixAfIcY_ZLgIKP5NvvXdtBVUM,5126 +pyarrow/include/arrow/python/flight.h,sha256=u5UnulNJqMuXQLlODUWuoyxq-GtL1HuHmVGNzobUVGc,14311 +pyarrow/include/arrow/python/gdb.h,sha256=H-qvM-nU8a_3Z5tk8PvppTwQtBMSZhQKQIVgRAsRfFg,972 +pyarrow/include/arrow/python/helpers.h,sha256=jVNFEbvJXmCceJti3J3-MnZkNlJoynQNq334tt29bbs,5489 +pyarrow/include/arrow/python/inference.h,sha256=FUFvB4Zy7V-tueXdmbDcqTeLK4xj5GZEeRW5yhiJlsU,2038 +pyarrow/include/arrow/python/io.h,sha256=4jGnodpSUlnVqAVh9fWId7H4WldlLPkXyroABpdaW6w,3858 +pyarrow/include/arrow/python/ipc.h,sha256=SZbw6jCCqLiLNCY3k632GmwHeD_r_xrDS0dhqV49VhY,2259 +pyarrow/include/arrow/python/iterators.h,sha256=Ugfm3JvetAH0l-oAjjpZfhrUBqRimVMaw4-xusvqLSg,7327 +pyarrow/include/arrow/python/lib.h,sha256=UNSuhntc2NTo9y8txHS8MqB10IQN41UuXjb5dGtstfw,4631 +pyarrow/include/arrow/python/lib_api.h,sha256=SCXALS0e94-_uXt9ZlqlUlvU-cclpx7xT8LpxAU1nbM,19487 +pyarrow/include/arrow/python/numpy_convert.h,sha256=y13eHwfe1lJKzadoTr2-GyX6xPsE6Z7FN31s7PN-2Rk,4870 +pyarrow/include/arrow/python/numpy_init.h,sha256=FniVHP7W2YBlenoMYhQrODvoqqvDMSls2JANGtNPQts,999 +pyarrow/include/arrow/python/numpy_interop.h,sha256=rI6ek8JTOYtjo7gEADSDBS6QuAOHa2A0YQPZ2GeypFw,3418 +pyarrow/include/arrow/python/numpy_to_arrow.h,sha256=z9KapsuoOSpWILPt9bea7GR4BL6AQ28T6DUO0mSkh3k,2760 +pyarrow/include/arrow/python/parquet_encryption.h,sha256=Mc8tZ8gIfkH0AckNiIOt6hesP_MVKeKhcytT24ZOLdQ,4861 +pyarrow/include/arrow/python/pch.h,sha256=vkbgStQjq820YeHlXBPdzQ-W9LyzJrTGfMBpnMMqahk,1129 +pyarrow/include/arrow/python/platform.h,sha256=XYS5IqiMUejxN2COzu60Zs8b_wAaGTBw4M-zKVqqs5U,1422 +pyarrow/include/arrow/python/pyarrow.h,sha256=TK3BtD9n3QKOQ9dX3LXbQc0hu9alWcufV0O93iQW7B0,2761 +pyarrow/include/arrow/python/pyarrow_api.h,sha256=7l0G4-_m9yALYoifsY8Z6qh3HHD0PgkpVSgCn_JaGU4,867 +pyarrow/include/arrow/python/pyarrow_lib.h,sha256=-70_Ckj3_0ImlzaXSJOE_d3w9pGM66lXiGPyln9c96Y,863 +pyarrow/include/arrow/python/python_test.h,sha256=ea32mM20uHySlygi9MtVxr26O-ydTZHCUQIlxaIMjT4,1195 +pyarrow/include/arrow/python/python_to_arrow.h,sha256=BoVytf6P7PBYXyznchElKZSFvEsFyimB-tLFdw0AUNo,2521 +pyarrow/include/arrow/python/serialize.h,sha256=HVBhIKgc7A4YOmwYfjE2Hqj1Yxl9suCJb940KxrVcrs,4630 +pyarrow/include/arrow/python/type_traits.h,sha256=B_NsRT_hZG8D91sTcihJyKF5SrslPcFmj12QfbpHuLI,10093 +pyarrow/include/arrow/python/udf.h,sha256=de3R8PhNJO5lT9oCqRxe8e2_SE3jBpHOkwbNqCrlgjQ,3104 +pyarrow/include/arrow/python/vendored/pythoncapi_compat.h,sha256=bzMnlHTCfjk5DQRIxwytunYh5aQxU3iSElaaDyNnAY8,40900 +pyarrow/include/arrow/python/visibility.h,sha256=hwJw5sGrWJckQkNaAuLe4Tf-VDjQbXknyzNOVgZI3FI,1381 +pyarrow/include/arrow/record_batch.h,sha256=qk-6MakursNrRIec5MZeCfjUSYyXPQsyYbB1FJcYb7g,17835 +pyarrow/include/arrow/result.h,sha256=1NmZkkVhjVe1CAI7dFXRFdNQefEtk1lxMCF92o41ROE,17739 +pyarrow/include/arrow/scalar.h,sha256=7SguSvJ4wohjqV-FSKboC1pgaFVB09dNfkMz36JjcM8,36543 +pyarrow/include/arrow/sparse_tensor.h,sha256=dd6eQmCjfCmmI76hgsC37R-qPJ11IMhafVaxSo2XJFs,25205 +pyarrow/include/arrow/status.h,sha256=2D-uFQpe83Yja8Qygm1cXvWAybuiibyxlavOxFuPEjs,16417 
+pyarrow/include/arrow/stl.h,sha256=yGoKi-YUq6DgxkIW27S5B0_rXd2YiUrdzA1YdvHNCHQ,18164 +pyarrow/include/arrow/stl_allocator.h,sha256=TBbvjbuQIH9y88FI2SaqAL7pOIt3wZ1xMKwXqeKNiJE,4956 +pyarrow/include/arrow/stl_iterator.h,sha256=2nzrza4st-mdii2dqBEGCzql07t-M3rbDQjvzm8S7sY,9963 +pyarrow/include/arrow/table.h,sha256=UoixXGk5S1ckV35utXjbA-KUBQrSeqvfrhSmk22k760,14647 +pyarrow/include/arrow/table_builder.h,sha256=LRcLCL2iUrj6vF4f9AjPswVjqtqlMw7z_8VBAfUJeCo,3763 +pyarrow/include/arrow/tensor.h,sha256=mgPkJ5f5ngl0qDkeYf-uk-BtX7Gyr-0DUuX1qB6YadE,9093 +pyarrow/include/arrow/tensor/converter.h,sha256=RZq0Try_kiZ085_d_CvhewMsd57InGb2TCeiveaf-Oo,2891 +pyarrow/include/arrow/testing/async_test_util.h,sha256=IrHWfPeIyhrgeTGHUPLt92LdsofmFX6khjngWsZv3dY,2262 +pyarrow/include/arrow/testing/builder.h,sha256=4x0bWOedaVomWU0m7dF99irOv3flR-_p-IMofTDZtwo,8556 +pyarrow/include/arrow/testing/executor_util.h,sha256=38_rF-V_9zF1ttJMspkPiI-34VU1RDjg1ADBS8lUFHk,1885 +pyarrow/include/arrow/testing/extension_type.h,sha256=5l_28-SdoO0r6r-nVqkXsfSRFWTLTPgOFEpXzZiqh6U,7430 +pyarrow/include/arrow/testing/fixed_width_test_util.h,sha256=g6yB7RkziU7HEhNJnxOhkn2nE5HeXaFX3tbBX3q9_sE,3091 +pyarrow/include/arrow/testing/future_util.h,sha256=qIhi417OGMWSMUSDHjkGTYd-ihZbqw8ZSIRwJ01vbKg,6246 +pyarrow/include/arrow/testing/generator.h,sha256=h9Kw9GfDnCHDLl7IsEgaLCi8UDu7R6MHL7Au2TWfMVc,12024 +pyarrow/include/arrow/testing/gtest_compat.h,sha256=0NqH39my7m1FMpsrQYnxQx4bdEE10SCXZaysN6yjQFA,1311 +pyarrow/include/arrow/testing/gtest_util.h,sha256=jnVGbM53nnXO433aUNmZHlMyiQ1ftENITLbtqRF6R08,24496 +pyarrow/include/arrow/testing/matchers.h,sha256=3ys7UI6YpFeMvFCgjmF_VWn1w7Hzhqbr2c-_EuJBpnU,16852 +pyarrow/include/arrow/testing/pch.h,sha256=wKPN4rZnVcQbmpn02Sx5tSa7-MEhpUR1w-YJ6drtyRM,1164 +pyarrow/include/arrow/testing/process.h,sha256=AzPW3Lh2R4sTm-RUUi4Od3aSba9zoLcS_zHBxztv4zI,1372 +pyarrow/include/arrow/testing/random.h,sha256=UMxioQORvoZOsodZM6T-ujza5WuYKwAndbvnOImDsqQ,37046 +pyarrow/include/arrow/testing/uniform_real.h,sha256=-G_2J9cvevoCtB55vsCsWtJkMUHLIMyOwdT6G8ZW45Y,2970 +pyarrow/include/arrow/testing/util.h,sha256=Vr_F5jZQo6kd2-PBq5M0IjODeuaY7cNU7dDovpnPtLQ,5391 +pyarrow/include/arrow/testing/visibility.h,sha256=-wjc00QIhygXJa7tknbIL685AQ1wnyCPr-EtVzkzmq0,1606 +pyarrow/include/arrow/type.h,sha256=yxS8FpeY8eK4cjiSc0nBPdKAF8CRnu7cpR0VX_BCY74,96772 +pyarrow/include/arrow/type_fwd.h,sha256=2stweTjQZvCwuWYBFI_QJu2369tT6Y1Az4AIien0NVU,23442 +pyarrow/include/arrow/type_traits.h,sha256=5XS-cpIzY1DQmNIwzhL7zd4ItxPfOgCwEqWfVG-zU80,54725 +pyarrow/include/arrow/util/algorithm.h,sha256=045EVzsC9rThlRVFaCoBmmtWZmFy5y28PR9yapn9sXY,1229 +pyarrow/include/arrow/util/align_util.h,sha256=DG2L24KReTiU8nFpXLigbflkKouKWTPUf6osQs6mxiY,10669 +pyarrow/include/arrow/util/aligned_storage.h,sha256=ZsAqIA3DV3jIhCnC8mmA4J7FCnnQ-CV-gJj_T_pTmsI,4987 +pyarrow/include/arrow/util/async_generator.h,sha256=dMfy3t58k9zQ82LeD002LZT0uEce_QWoDRfwjIapwKk,77704 +pyarrow/include/arrow/util/async_generator_fwd.h,sha256=Y7EZ4VXdvqp7DnzG5I6rTt123_8kQhAgYIOhNcLvBdA,1737 +pyarrow/include/arrow/util/async_util.h,sha256=1nnAJZ22iK7wSzmvZDo3PMhuWqJIt2qKdlXzTyhoCK4,19759 +pyarrow/include/arrow/util/base64.h,sha256=qzcBE98cg8Tx5iPJAvQ4Pdf2yc6R2r-4yGJS1_DEIeY,1095 +pyarrow/include/arrow/util/basic_decimal.h,sha256=3kDphzHx5TxSnbeZJtRDN5R3McKxxjMMePvGZElWNCI,33123 +pyarrow/include/arrow/util/benchmark_util.h,sha256=SG3gfwE-wGNZAwpL3TvffnSiZGM2cztV5xRBnbqy2Mw,7641 +pyarrow/include/arrow/util/binary_view_util.h,sha256=-sFAQX9cnfWmmZJo8stFX5vkJki7T2UloAvDzYO0MN8,4625 
+pyarrow/include/arrow/util/bit_block_counter.h,sha256=iSIemzizxVokwC0Ze6SjSi-al_nrP2ViXF6JPoIVUWc,20162 +pyarrow/include/arrow/util/bit_run_reader.h,sha256=IWDww6Dm8OFsCRlJ0hEpJKiHMK3nUM3pqbd09mZhcIQ,16616 +pyarrow/include/arrow/util/bit_util.h,sha256=S0TbReZet8MpPFZk9wjfYzfKpkBquthkkFk2QtxzB7U,12108 +pyarrow/include/arrow/util/bitmap.h,sha256=qDoNl-S8QFoZ220HsAtAN-s-Xm5JcnjOXNOGdaIssL0,17462 +pyarrow/include/arrow/util/bitmap_builders.h,sha256=zOb7Q-eX9vm9rkgu0Z3ftUDsI1xPthxJ_iC4qDYR1is,1563 +pyarrow/include/arrow/util/bitmap_generate.h,sha256=m6ZsNwx1GhsEktQr63NxXHQkX2B7Nti011XYsPg2xfo,3661 +pyarrow/include/arrow/util/bitmap_ops.h,sha256=87_SXoqmVPRC6umXFitektDCIeI8yOalYWUonzdWjt8,10750 +pyarrow/include/arrow/util/bitmap_reader.h,sha256=pLrMDWhVo-Qb3V1mLASAz_aI6QZxDHRr37EtqxqGd9E,8353 +pyarrow/include/arrow/util/bitmap_visit.h,sha256=myn8k66VrvZnL6R6VW6IDPTfO68VxjbJ8Up5IuSjFL4,3470 +pyarrow/include/arrow/util/bitmap_writer.h,sha256=a4goXhLlY0qcfvYxbfbGD_HZ8Au1wFcbV1tVF3BPaXs,9383 +pyarrow/include/arrow/util/bitset_stack.h,sha256=D49IZZSzZOM2hqh6b-fT0vgRISf1mQnl4oG5nnLBZ4A,2776 +pyarrow/include/arrow/util/bpacking.h,sha256=qiiYXgZLWZcYX6sm75_vBQ6qpHtS1AwasL59YQL2Ptk,1175 +pyarrow/include/arrow/util/bpacking64_default.h,sha256=q7kf_BW62k45v1qMtnJtLIPk8VtJIALc5nXkYmISy3w,196990 +pyarrow/include/arrow/util/bpacking_avx2.h,sha256=ymQJGQc54W3zbrSoktjbAcBnWwbq_SphiXLLI-G6fHg,1009 +pyarrow/include/arrow/util/bpacking_avx512.h,sha256=Z_rAQpiKJEH-9QSHUXpbDmZiAgIm7CPCHfPnwlIZDAE,1011 +pyarrow/include/arrow/util/bpacking_default.h,sha256=nDi4g5JdyWwXa_J3EqE22bG9R4G7Czd6W75F9spRU5U,103760 +pyarrow/include/arrow/util/bpacking_neon.h,sha256=vE-V4E8dpqSjk7dq8kagD07-nhRQKGvcYMhc_dE4nqg,1009 +pyarrow/include/arrow/util/byte_size.h,sha256=Pd2c_3a0IeSOUevhPIlXNkDmgoB06g4c9YCsuRwwSKM,3997 +pyarrow/include/arrow/util/cancel.h,sha256=oW33c4AXSKLHUc5R_1mZ4ssjmLXU_P0Jk6GDO3IwZUo,3651 +pyarrow/include/arrow/util/checked_cast.h,sha256=SR9Qg8NuLSBJw2w1UfgeGvCfT8k7wrbN7BzADQOZfAU,2076 +pyarrow/include/arrow/util/compare.h,sha256=OLrSSyllkY4Sv00IK-37A2d68gr4OwnWJsxn1aF9xTU,1982 +pyarrow/include/arrow/util/compression.h,sha256=fvlURoWJsgO8Hr6Xs_VNaqiOatmIGn9ktVUkYv7pIu4,8427 +pyarrow/include/arrow/util/concurrent_map.h,sha256=wMi9WDHfRuJ_aSFgcJPpsVwGJ9vIJ5agaZ3rVUlwGe4,1775 +pyarrow/include/arrow/util/config.h,sha256=NGJWc5QVW-IoYYSPAxCj6xzP35y1U7HGal0CXfdY1Bo,2278 +pyarrow/include/arrow/util/converter.h,sha256=PILfos6VlnLK6fOFMfLIUhiKl3o1dJo9T4HJXeR7V5E,14637 +pyarrow/include/arrow/util/counting_semaphore.h,sha256=iXHYagqi_-ay73T1uPmv7pG334SY34DUQLSdtD_4_tA,2251 +pyarrow/include/arrow/util/cpu_info.h,sha256=MqLdJabBZkzDjiScaQ7if9dmoAGvXT2QavGoGkho3lU,3964 +pyarrow/include/arrow/util/crc32.h,sha256=4gN0M-SRnxaGKci2ATPbMWZG2TG3YULXjaTpadV0Udk,1337 +pyarrow/include/arrow/util/debug.h,sha256=CPB_oDOuZ_u89e9wM8bGn88mGvClgfa7UDxDph6v9sY,971 +pyarrow/include/arrow/util/decimal.h,sha256=ozY_pRsBgftG73qz0KKEPchFQ5HRTb5oxCcTIdWEL7g,20831 +pyarrow/include/arrow/util/delimiting.h,sha256=JYe9YcWMeFT_ISuojx_VgVqOYLvZ2TiiR2sNn-WdeBQ,7317 +pyarrow/include/arrow/util/dict_util.h,sha256=HipvAVlQ1Q6zNneu9tYOwVUv6NLklBu2IfZ1eoeSpVg,986 +pyarrow/include/arrow/util/dispatch.h,sha256=g6R9w8asCTRyDTFoxUipvdOeh6Ye_FvZBGP6Zwg2t3M,3235 +pyarrow/include/arrow/util/double_conversion.h,sha256=23QU2TFX4hpBZnoqMDyTKxZoH7mU9qkY2vkF1KL8bW4,1243 +pyarrow/include/arrow/util/endian.h,sha256=jp4QoQ9r2vb-oigrlb9AhQW7Lxgxjj7desQjzkEre7g,8176 +pyarrow/include/arrow/util/float16.h,sha256=RaJBIWnDdqj7uw2YskxBM0Wlpnrq7QRbMCiTZLr7gJY,7418 
+pyarrow/include/arrow/util/formatting.h,sha256=782wKN6ZKlHO7cQLC8CKCF9STixvLGjXrp_CwRqXyVs,22554 +pyarrow/include/arrow/util/functional.h,sha256=4ljKXSWX3G_lBT2BfLXuG44pzZwVKeaojpLWCniqKyc,5612 +pyarrow/include/arrow/util/future.h,sha256=tsSVDEH2dhXKyvIKl6R9BVBolpPdZXoRRf2-YRbtdxg,32296 +pyarrow/include/arrow/util/hash_util.h,sha256=CjiNVPUJPxXvVJy7ys79aIb7YB6Bm-5nTJAR4DHsxcs,1918 +pyarrow/include/arrow/util/hashing.h,sha256=baLrNZVhO0choWat_Bie2OV821WSTiutqIVfDMjYO6o,32892 +pyarrow/include/arrow/util/int_util.h,sha256=zTOAq57M4pUe469WpnW6I5hNtxe3vGRHlZWhngA1DzM,4859 +pyarrow/include/arrow/util/int_util_overflow.h,sha256=AtvkG7v3-1gVzW5SrFrdVkYuXFtT76_nxrKtzIbz_9U,4895 +pyarrow/include/arrow/util/io_util.h,sha256=U6VTCh0yKUmYPaw2oG-CllJd4J02Gce6b0qTfqFi9E4,13709 +pyarrow/include/arrow/util/iterator.h,sha256=nprqdPs6wrrgi6RHIJ2VMQI1YFya-57wBQfOEmHoKUc,18087 +pyarrow/include/arrow/util/key_value_metadata.h,sha256=wjU6uQGcSmy-YFqMs6rwLP7E4X-0IFBjPrWZstistzQ,3590 +pyarrow/include/arrow/util/launder.h,sha256=C3rNBRh4reuUp8YuRdGQU95WPc8vl4bAY-z5LXgDiuA,1046 +pyarrow/include/arrow/util/list_util.h,sha256=_OmtsDqe-mnZ_7tVWxB2yHdgCJhpiME_RP3nXHzKbdI,2028 +pyarrow/include/arrow/util/logger.h,sha256=p9i4dNgne36LWpFmNSYBYgTQ4kFSao20dJ40LgRRZKQ,6693 +pyarrow/include/arrow/util/logging.h,sha256=eY1sZ1QCcvy5lpJwfOCL2rtRgLjc8V8yDf9usSa9-d4,9694 +pyarrow/include/arrow/util/macros.h,sha256=dqnFiDUrFUyqHyNP4xEr54WgaAEXX8gE4ZG7-i3nfZQ,9336 +pyarrow/include/arrow/util/map.h,sha256=KbKB3QNc3aWR_0YU1S7aF9fdI0VCABGxEF1VES2oOqU,2476 +pyarrow/include/arrow/util/math_constants.h,sha256=2sfWoVc8syHz8X26XgBmejzXStl7hmvKiOh9622oUZA,1112 +pyarrow/include/arrow/util/memory.h,sha256=qsxFgvj_wozO5OxIs6fHdcam7aifpozqc1aE81P91Yo,1566 +pyarrow/include/arrow/util/mutex.h,sha256=n4bsrHK2Q8zbYsQEyNaFqNu__vvqgwo1AfrLLCxfkpU,2554 +pyarrow/include/arrow/util/parallel.h,sha256=iZBn0C7HkQhGNKET5WTXCJ2FftcryCZAyBGwcg7qRvo,3616 +pyarrow/include/arrow/util/pcg_random.h,sha256=nbXowfCJFiy4GjVfF9I8VvB6fxkyR5zNB1FKdnFsYTQ,1252 +pyarrow/include/arrow/util/prefetch.h,sha256=vaE4FPdscbtO0cPbzl8F1PzB1NDO18ytYlEmZCHDjHs,1251 +pyarrow/include/arrow/util/print.h,sha256=X0CfuWzDkq8CNHaEUH3I27Yi4v_zdoOo7sdrTad8Wr0,2444 +pyarrow/include/arrow/util/queue.h,sha256=X9vRZQX3YL_a2Lzwe-zcNNHguR7FoGYmD-Q0THqsCBM,1017 +pyarrow/include/arrow/util/range.h,sha256=yhe5pJiZIiLUO8tYr408Y9yEsFrFd7FrBMeTL2hAOKY,8526 +pyarrow/include/arrow/util/ree_util.h,sha256=waTBOQfwWGHhoAYHTyyhUnM2BSwOqsof_H_akHvUgno,22395 +pyarrow/include/arrow/util/regex.h,sha256=Tj92CttOh2HxS0EKQ_9-sxMBAsQrDOUKNP0ngIJFdP8,1742 +pyarrow/include/arrow/util/rows_to_batches.h,sha256=PZNoLeMCfJJdeHVvUny0UHc5AtS0hctUCi7zUztJpeE,7120 +pyarrow/include/arrow/util/simd.h,sha256=PpKm-aWpZYIYP0NnyGrQceOO9m3_7JbN4uro0IhIT9w,1679 +pyarrow/include/arrow/util/small_vector.h,sha256=dDNNMFpNdtIbxLP3L-h_bv3A8raYv4IVuyLEzUVMgck,14421 +pyarrow/include/arrow/util/sort.h,sha256=cXZvBN_EcXkN5j0xhX2oNisbChT2QKXP9KzDgjXW2_M,2466 +pyarrow/include/arrow/util/spaced.h,sha256=790FFCTdZA-z6qKuEJM5_wG24SqTTVtyj7PKnLBe7_Q,3567 +pyarrow/include/arrow/util/span.h,sha256=2zDPUc5ciTQovM-T32EZt4iMpqcsoL7Y46ovKjo-7ro,5551 +pyarrow/include/arrow/util/stopwatch.h,sha256=ADGbEEU1x-fvp_NsIdTHH5BW0b9jDB8rTAj1WOgkClc,1401 +pyarrow/include/arrow/util/string.h,sha256=hYtg4d3kGQBHdd0vGuKJTlVeueCCgfyD3iq-feMA3p8,5756 +pyarrow/include/arrow/util/string_builder.h,sha256=UwOKPz8BQjtl9ecBZ0INoYWMWUkAVQOd_aC8xZZMCgo,2446 +pyarrow/include/arrow/util/task_group.h,sha256=fI330NoJT8u84AEUA6pSxWrE7UBKn2LaM4DfPFoalqA,4362 
+pyarrow/include/arrow/util/tdigest.h,sha256=L6nSj-FVlYLtwKJ94WX9qps9YU6Yg-e3xwP6C0qE7pw,3058 +pyarrow/include/arrow/util/test_common.h,sha256=ZniLT8TvAUdCE2T2YrtlDKdwDNPBpT5e9V1EiPHH9LU,2837 +pyarrow/include/arrow/util/thread_pool.h,sha256=4ztLwkJHQJQmTmqwy8IGDmAo8X4N-o3qi6f91agzkkQ,24426 +pyarrow/include/arrow/util/time.h,sha256=4Xi8JzaYlWFxVaenmCJ7orMgu4cuKELvbtMiszuJHUA,2988 +pyarrow/include/arrow/util/tracing.h,sha256=sVfC_Rj2gwkWKVSKT0l0FOO5c2EGsfYwlkZX4d9ncxA,1286 +pyarrow/include/arrow/util/trie.h,sha256=WBvryYO2sNdoPc-UB-XmQ3WzSed79qIsSg7YWCrvwNY,7121 +pyarrow/include/arrow/util/type_fwd.h,sha256=aC3ZZR2FniFUR3InlZDXH8dknZKvmM0RBocHwFKU_Us,1521 +pyarrow/include/arrow/util/type_traits.h,sha256=F0Gdg_3faM0MmZBOXOspRzUwuxnjKbFaVpJiTEaOXGU,1731 +pyarrow/include/arrow/util/ubsan.h,sha256=dJNGOe0smDe1akrYLdYcIbAWDJNS6Z7NRgqgDnr2emc,2765 +pyarrow/include/arrow/util/union_util.h,sha256=PSssBiw-v-PDen_q75c6OkNO5PwyIPhGbf9PMJj7P2M,1211 +pyarrow/include/arrow/util/unreachable.h,sha256=O1TG4ozCYT3_xvDpJouKWrlFADIEpIemQ28y4DqIwu4,1070 +pyarrow/include/arrow/util/uri.h,sha256=D24zebazFcrKGt7iGpkcGQ87DuF-2AbjPKVkDlq9Nuk,3886 +pyarrow/include/arrow/util/utf8.h,sha256=flGZ786kHo33Xg_zw0zVA9GAT8jYdPUHTVhIPHGjOj8,2031 +pyarrow/include/arrow/util/value_parsing.h,sha256=ypbnIIxfFDfDmELinEiS2RYSkeabYDAfuKPW5YsmfRw,29995 +pyarrow/include/arrow/util/vector.h,sha256=w1lxZG3CU0gq2ZrByeU8QX2A0JeTtooGdaZONUsVlfs,5697 +pyarrow/include/arrow/util/visibility.h,sha256=DFEdl8TCr30r3b7vlpgzJIiA5NsK7eW9UmeL47PgcLk,2835 +pyarrow/include/arrow/util/windows_compatibility.h,sha256=Chme9fWRqYRzfIbLw7V_yeiIWd3F4dFeG6ImHHr4Xqw,1255 +pyarrow/include/arrow/util/windows_fixup.h,sha256=hjoh6zvB8u8OVUQqLtdcrmohMzoAoLy6XJFLxcfFhK0,1435 +pyarrow/include/arrow/vendored/ProducerConsumerQueue.h,sha256=Bz1ks3NDgXXLfT8TMUkE38RpMOSwKRRtwU1e37Y1CUw,6101 +pyarrow/include/arrow/vendored/datetime.h,sha256=lVHO-GyyevnRnc2XmnRS33plbC7FGKcPJk0jnWrgLxw,1017 +pyarrow/include/arrow/vendored/datetime/date.h,sha256=PtXn3ecSQSMqcvMpphq4GVx49dXQbsfDQb5NMlr1j9k,237838 +pyarrow/include/arrow/vendored/datetime/ios.h,sha256=SSzUcU3-1_slQ-F8dS8MPMdKyhSmXKFmvSiUF3Wuaoo,1679 +pyarrow/include/arrow/vendored/datetime/tz.h,sha256=k3-r1rO0n-u33qLAb9sKrRWPtb9JXGIW5zXAfnTSiEw,84884 +pyarrow/include/arrow/vendored/datetime/tz_private.h,sha256=OmiXT6TLewxKLT8JdCp42FJTf-gRYi5ylGu_Ks01T1I,10732 +pyarrow/include/arrow/vendored/datetime/visibility.h,sha256=2P38U5rN_wE45fGYqkAqh7P0XLj2eswzz8RgSRJ0c9s,951 +pyarrow/include/arrow/vendored/double-conversion/bignum-dtoa.h,sha256=imGhcg0RywMsFNMYTqp6rlXw2HZCIAla8SC_n92gCqE,4358 +pyarrow/include/arrow/vendored/double-conversion/bignum.h,sha256=RnQ2CPL8Pt6fVCGh_8VDF11e_GyrrwO0IH0uMnTcsEs,5949 +pyarrow/include/arrow/vendored/double-conversion/cached-powers.h,sha256=jjwfR3bue7mNlE5lbTrFR2KlgjRew2OkmjBa7oQO0Qg,3079 +pyarrow/include/arrow/vendored/double-conversion/diy-fp.h,sha256=J-RgqH27jspT5Ubth9pTA9NAZH6e7n1OVhxModgi8Sc,5088 +pyarrow/include/arrow/vendored/double-conversion/double-conversion.h,sha256=J1Tl5-8aFY0A9SnaA9z5Q90jnMxw55illPIuE-jdD5Q,1804 +pyarrow/include/arrow/vendored/double-conversion/double-to-string.h,sha256=C-tKRi0IuLycXgS6CC1oiFkCroOo_-AO0VOjmfe0tlE,23925 +pyarrow/include/arrow/vendored/double-conversion/fast-dtoa.h,sha256=ZAho25fqeP3t2RM0XgqfhTBXQIIicACLpdyHHMRX3JU,4122 +pyarrow/include/arrow/vendored/double-conversion/fixed-dtoa.h,sha256=HLnpxkHjKldm-FBiDRbADYljJBSYbQGP4Gz-sVbiSJU,2828 +pyarrow/include/arrow/vendored/double-conversion/ieee.h,sha256=CVKA9RXSjv4ZygqDHMiF-H2hUh3QHQvp1GZYC3MAhkE,15281 
+pyarrow/include/arrow/vendored/double-conversion/string-to-double.h,sha256=Ul6b-2R0pjUaAWNM3Ki4kH933LqrW6_XfPz4BSiE2v8,10906 +pyarrow/include/arrow/vendored/double-conversion/strtod.h,sha256=6xCRm47vmcghYJug5mhhTVbsZ3m3Y6tQfMehEyVZNx0,3096 +pyarrow/include/arrow/vendored/double-conversion/utils.h,sha256=wFRb5cGABiNoUSCnvKmdv_KIMcBtX1PX89tPFfvgbQI,15614 +pyarrow/include/arrow/vendored/pcg/pcg_extras.hpp,sha256=FEYzq8NFxPfdJyLs4kVtTBLkaD6iO71INz9EJnaxTdc,19784 +pyarrow/include/arrow/vendored/pcg/pcg_random.hpp,sha256=7TaV3nZhcwpf6XxlZ6cod1GaW5gm-iUn67t2fiMPNbA,73501 +pyarrow/include/arrow/vendored/pcg/pcg_uint128.hpp,sha256=r8exMtH21S8pjizyZZvP8Q8lAdxkKF22ZEiurSTFtzM,28411 +pyarrow/include/arrow/vendored/portable-snippets/debug-trap.h,sha256=9KphJ9gRtDT9DXR9iZ7aS23xa2T8tLmLsFEJMg0pLDQ,3081 +pyarrow/include/arrow/vendored/portable-snippets/safe-math.h,sha256=q9yWh34bsFu1vSqLTuI3n_cIU4TlY98Lk1elxKHvZP0,48167 +pyarrow/include/arrow/vendored/strptime.h,sha256=q1IZi5CvyUp_PNzbQ4_XLroAV24VEovBEz2TkpwUJ9c,1212 +pyarrow/include/arrow/vendored/xxhash.h,sha256=MUwtyzu7xjkx9mBcS65SaDcCK7tgeqQgj-KYEMxcHWc,844 +pyarrow/include/arrow/vendored/xxhash/xxhash.h,sha256=videnbIaUDw38kaDzbSQjyNwo-NauW4CxOpz3I45nEM,253096 +pyarrow/include/arrow/visit_array_inline.h,sha256=XuQjuME8XZeJp7W86YuCsuoVVgmG1NulXAA0KJkmmB0,2446 +pyarrow/include/arrow/visit_data_inline.h,sha256=4MkdFVsrjhMyTDNrScQtOYV_nwzqR2ddSS2yYnbyLt0,12460 +pyarrow/include/arrow/visit_scalar_inline.h,sha256=KvNY0j8nE9gs_805LXMV3ATgvxvUqW4UeKpXUxR3rMA,2419 +pyarrow/include/arrow/visit_type_inline.h,sha256=45aoF8APn8hm909nLBngls669o2yKCn24WlL5XdDpa4,4397 +pyarrow/include/arrow/visitor.h,sha256=NKos98j54uY9tdXzctI_n_nwFRrXNOwanxLDqDZONw4,8690 +pyarrow/include/arrow/visitor_generate.h,sha256=n2YKZW-5hY7ICQSwEUBZIYh2eg9ZoTfD54XRd9OlNDo,3324 +pyarrow/include/parquet/api/io.h,sha256=Ricq0d2R4QXHiGZCbjxZ_0F_QmKq0IrfTidNu5NoXPI,847 +pyarrow/include/parquet/api/reader.h,sha256=vnM5XDPn1TVsDJk4SDgb3ZU2Ta4vdrRzCpDWO90rYHk,1204 +pyarrow/include/parquet/api/schema.h,sha256=KsNJ529pEh7bGUa0rLUCcfanI9rW2uSTirgpvKq0hdc,855 +pyarrow/include/parquet/api/writer.h,sha256=UJZbY8QGVRMtAmozzjoM9TnI4gssqlNFUKCXBw2IfuI,1007 +pyarrow/include/parquet/arrow/reader.h,sha256=l4R351BVOWpYJOv_vyqWmXdJUErm2z_ztvTAv537q0w,15305 +pyarrow/include/parquet/arrow/schema.h,sha256=Mi56ul7itNS6NDbMpKOJCufjHVqaSY5_rbsNRNLE560,6204 +pyarrow/include/parquet/arrow/test_util.h,sha256=Edb5eSSEwkIExpHZ9Q0LJgPzggWNry4WMQ_i4q9z1uo,20540 +pyarrow/include/parquet/arrow/writer.h,sha256=XicHPFeGb92AcsNRDblJ7V4Hmst2qSPGYYT9MTSNNmI,7095 +pyarrow/include/parquet/benchmark_util.h,sha256=RhFvoDBVyfd5Sv0fm9JO4JrXWJRGYYmIIrHXi0cSJP0,1756 +pyarrow/include/parquet/bloom_filter.h,sha256=TC3OxK0J2v6tHxT_Bbw7mlYtM0603KXgBoHRvmzM9aA,14999 +pyarrow/include/parquet/bloom_filter_reader.h,sha256=63kpHYKs5TPrbRamkBLZsDYbD-I9UeVhF-R8d7JHeLg,2892 +pyarrow/include/parquet/column_page.h,sha256=_BbPcMfSa52JmteUMdsc7BW6KWoGXn9aQepDgr0veSE,6526 +pyarrow/include/parquet/column_reader.h,sha256=3QwlHlpiS5e5jtWmI_kRmD4jrrC8ljfpqF0ilf5JgNI,19299 +pyarrow/include/parquet/column_scanner.h,sha256=HecBvh-z0n_1HJsD-GIdcGHQAvDOHKlLzppB9RBsD9s,8863 +pyarrow/include/parquet/column_writer.h,sha256=Y9VN1eJtsYmQVhpL9UPiWGrHbgSDbDds19Z1nv_yfOA,12294 +pyarrow/include/parquet/encoding.h,sha256=jSYqNVLnsKFu95Mb3uhTP06-7La5_6kNJwn00VqSK_Q,16341 +pyarrow/include/parquet/encryption/crypto_factory.h,sha256=RT4iznr6uvSIPbUzh_7s6Cexe8uMbQkzgrjCTGYBC6I,7057 
+pyarrow/include/parquet/encryption/encryption.h,sha256=bHJ7USckzezXfydqjJstljcjuR15r8U6zh8z3IoINCo,19842 +pyarrow/include/parquet/encryption/file_key_material_store.h,sha256=YzAVO3M2H5v5Fz2b_WlmB3GE5wVbMEnFTL3S9XPH6k0,2200 +pyarrow/include/parquet/encryption/file_key_unwrapper.h,sha256=pB30St8lGEaEAxNcwnDnlGtATTvc1muMzNOusfgqzT8,4635 +pyarrow/include/parquet/encryption/file_key_wrapper.h,sha256=d2W4xICbSRAy7aPe5RKahhPhiJDfvxHY_v_lifq7wqY,3762 +pyarrow/include/parquet/encryption/file_system_key_material_store.h,sha256=9H1ey0O3LL4dg9VVeFLNxlZ7Vr263JVaZHKVSu4s8MI,3573 +pyarrow/include/parquet/encryption/key_encryption_key.h,sha256=0c3ZrRud2vrCu5z513ocyPYxlsP2kg1fQ8m0Jqr701g,2232 +pyarrow/include/parquet/encryption/key_material.h,sha256=kPTSIuRFYOnH4BCPIB33zG9hp5D2Ba-5kZVlq3rFnRI,6221 +pyarrow/include/parquet/encryption/key_metadata.h,sha256=Pc0nA9LW3Fc9NLMMxz7osbw8si2jSiOVTES-J-9R0y0,4003 +pyarrow/include/parquet/encryption/key_toolkit.h,sha256=HPabI8qFnIMgxZYhHgXCzYV0LU1c5yJ16xjUx21I9b0,4577 +pyarrow/include/parquet/encryption/kms_client.h,sha256=D34pVHzkCbWqKnPIBYfs6cONxmuYzyLSS9-C52ZFhz0,3151 +pyarrow/include/parquet/encryption/kms_client_factory.h,sha256=VZ97CMgDQxx5oZWFGprjXsaM1hZ0wNudPmFU1_lniAc,1293 +pyarrow/include/parquet/encryption/local_wrap_kms_client.h,sha256=XZxkEct0-Tv93VDpda9sDou1kp9qkTKMxr36bpVcI8s,3954 +pyarrow/include/parquet/encryption/test_encryption_util.h,sha256=zIGeULeTOCU1N-XYHdvIppth5wnnTYEwf2h-OuTcQZQ,5209 +pyarrow/include/parquet/encryption/test_in_memory_kms.h,sha256=jYc5WPsrh_wcaaaWcjf23Gbiye3a_bdg2royUfukWEs,3521 +pyarrow/include/parquet/encryption/two_level_cache_with_expiration.h,sha256=cuHbX9gBWWyd0IPXNVjMmHxjPw7omYTns4If4YhBgSM,5075 +pyarrow/include/parquet/encryption/type_fwd.h,sha256=dL8snyUwNjhTQE2FQ2dXAUjTboEXhH2JOehQovHfixc,955 +pyarrow/include/parquet/exception.h,sha256=yc5A3iMqM9P59hnjuY8VXUIoF_JvbZVPHM6_wPtg4cI,5599 +pyarrow/include/parquet/file_reader.h,sha256=OFRKhwAww2N24aZOZcznzral1Or1TGIFGRd1aACARLQ,9664 +pyarrow/include/parquet/file_writer.h,sha256=6fK6Mn-MdiQ-J4oo8BTi_eVVVshlffoQiJzFaLRrqco,9343 +pyarrow/include/parquet/hasher.h,sha256=HSY1EjPD2xx_dB9HtAg-lXL7hB4j9MDE0cAlR7u0NOc,5227 +pyarrow/include/parquet/level_comparison.h,sha256=5z4fUJJPWq9W60l2CsAI7T7E2auGYD7m0fpR5rfLmsw,1306 +pyarrow/include/parquet/level_comparison_inc.h,sha256=r20_6Rv5L7UmFGJ68f-JaZ5hLXb87wvZa80hZNQoF-I,2494 +pyarrow/include/parquet/level_conversion.h,sha256=OsuqK1xiUnEnOLPKwfm9X-pXTaXRMlDIkj3lwGb2ggI,9432 +pyarrow/include/parquet/level_conversion_inc.h,sha256=0r2Gfd_FMidLGFC_a8kgpC9bnUt2-IBbAn9QbQFTrTo,14161 +pyarrow/include/parquet/metadata.h,sha256=ORXKWkfSM-64vTrZ-qrsQ5naKx_pk8XbjJEPwtct7wI,20751 +pyarrow/include/parquet/page_index.h,sha256=qBKqiq131jCUrtFCfwlBkeb8PL96yOPKg7AqkslnM60,16399 +pyarrow/include/parquet/parquet_version.h,sha256=JdG5J1FtQGc7YYUvn2ef2iHtTiwORave4RU6pm3dl9g,1173 +pyarrow/include/parquet/pch.h,sha256=zIdkjZS4kuFYra3woGMjmvYXCwB4IaXdpm_nR5Nz8hk,1249 +pyarrow/include/parquet/platform.h,sha256=VS0zEUC4d37LQmlQLQZ5aHNaiwRf8QrxixXdWf73m5Q,3898 +pyarrow/include/parquet/printer.h,sha256=_sJ5IoEj4naSTWxlhbq2Pc6WkNG3wMuxRy8zfKfsAJ8,1540 +pyarrow/include/parquet/properties.h,sha256=X5zn-xdztONv4QfK-gcfdh1CBAuq27cVj9jZQgQNqfA,46415 +pyarrow/include/parquet/schema.h,sha256=CjZh2i9WN5VeoDbLqy7M1AZtopZ43_C9blWG3OT2IfU,18222 +pyarrow/include/parquet/statistics.h,sha256=0sk7koXslu-KuVC6CsTiFVD1Fu_ZWPD_FLhcXALas_g,15176 +pyarrow/include/parquet/stream_reader.h,sha256=1WmN0vYCqTz1Lwb_Di4xPWTE-VbCQQuzZralSpWQm3U,8791 
+pyarrow/include/parquet/stream_writer.h,sha256=nw_v3nhrL682ozZ2KZKVkHnOsjwexbmBXTV2CKcq4YQ,7505 +pyarrow/include/parquet/test_util.h,sha256=gkJoOl_N4cG3L56uXVJi1RLiDVBl73yX01Dkx2Plt9g,31180 +pyarrow/include/parquet/type_fwd.h,sha256=qx6Dhg1HO0U99jdiUfu3rC7zhmQ-3i7WXsfEhrza3rE,3046 +pyarrow/include/parquet/types.h,sha256=IFbKlP0aZzW8Cn4U0QCIGboVb8hOnD6UvSGi6EqpvvE,25482 +pyarrow/include/parquet/windows_compatibility.h,sha256=xIEGHW354URgdIP9A4V303TJL8A1IkCEvp08bMKsHTU,897 +pyarrow/include/parquet/windows_fixup.h,sha256=DpyWCywx8YIqouun6BJcgMrHFMTCBgowWdJ1mnJnQ2s,1052 +pyarrow/include/parquet/xxhasher.h,sha256=QAa7ZE7S3UFtU_Voz3oi3YclIYhbhviJkafLOYgiuWg,2074 +pyarrow/includes/__init__.pxd,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pyarrow/includes/common.pxd,sha256=tYI1M3gk_d-uzNUpLcIxhNG5W67ycFSVb36Tv7hyN30,5452 +pyarrow/includes/libarrow.pxd,sha256=zOwU6egOTsU45dV0z0sEVLMu6N2iNXXxNB4R3R2QlyA,114590 +pyarrow/includes/libarrow_acero.pxd,sha256=c84RdYfIuFWW_36-1RELJsowfQwXhgUxbdC_xKQyFCI,5298 +pyarrow/includes/libarrow_cuda.pxd,sha256=0fRcHbCZY_gFdwIXIElnpGvTxeA5xVxZH1-vwZh16SM,4942 +pyarrow/includes/libarrow_dataset.pxd,sha256=LVHtNouC3ZWMmyD48JkYGXajf22Wax-FgzAV4URqySs,16993 +pyarrow/includes/libarrow_dataset_parquet.pxd,sha256=4me_u82JiInHNRvoazLXUTOO5sxVnyCk-BdfsYQZyWQ,4536 +pyarrow/includes/libarrow_feather.pxd,sha256=MTJUDQbfKP8Ir700Fobl7xcbjX7WcrsUV4mxFXlfwn0,2140 +pyarrow/includes/libarrow_flight.pxd,sha256=pcVtpB4Rx81RZoG3afIizmyQuTnckrqIPZyjvsIYYKE,24860 +pyarrow/includes/libarrow_fs.pxd,sha256=jG1sBGyTkU3X_XZKBMC-n3YsY8Po_2dIQdXyK9vXtHY,14973 +pyarrow/includes/libarrow_python.pxd,sha256=Fs9hNJZ-_fdVmqkNu3zGRUXy8Azt6_zniX_p1SKqM64,12387 +pyarrow/includes/libarrow_substrait.pxd,sha256=5ZJ0yHhM54I1GfmUaPMy5nRxLFsr-A625qUSmOhnQO8,3196 +pyarrow/includes/libgandiva.pxd,sha256=FLBd99IeU67Db9SnHS7oe6FgBZ1aIHuRc0pOiDv7hQc,11538 +pyarrow/includes/libparquet_encryption.pxd,sha256=fi3QrLpHN1_IaYRXvVMJdIgp7F_6aaLu1owP0I3BD5g,5898 +pyarrow/interchange/__init__.py,sha256=DH0bwbKpdjD1WCW1VinnXEuVLY098uHKkirv7DFc9JM,845 +pyarrow/interchange/__pycache__/__init__.cpython-312.pyc,, +pyarrow/interchange/__pycache__/buffer.cpython-312.pyc,, +pyarrow/interchange/__pycache__/column.cpython-312.pyc,, +pyarrow/interchange/__pycache__/dataframe.cpython-312.pyc,, +pyarrow/interchange/__pycache__/from_dataframe.cpython-312.pyc,, +pyarrow/interchange/buffer.py,sha256=NF_GU1uQ6INqHqCwzY6XQQqRxKDh6znEeDHiRqaEIQ0,3359 +pyarrow/interchange/column.py,sha256=afU794n3H7yf4gDQDuFLbtyDlgVnLk9iZ6sugb0h8_4,19370 +pyarrow/interchange/dataframe.py,sha256=tmSMmBvBAc-ZSUzE8tBNbvQLHuuxLuBkMkK6KYwtS8M,8405 +pyarrow/interchange/from_dataframe.py,sha256=JfkP4wuY_9x76H6RDtmsOzs6B6qe-1WS7zxpKeD481s,19709 +pyarrow/io.pxi,sha256=LcEqNanwQD7dr0XVHu52dnhlUUH25bhjDGGPO6Wet34,86616 +pyarrow/ipc.pxi,sha256=Reakb_rHbBipOr9QPEC0D2jBvQ87ORpVb5kasDdeY_4,41081 +pyarrow/ipc.py,sha256=Hb3qCPKRr_wth5u4WrkZHJyAZmIK5SoVSezfBOI97Ww,10107 +pyarrow/json.py,sha256=N9Y7_3TSrOEDy2OrmgQ8UKqUPMx1Bm9dYgot-brJ8Xw,858 +pyarrow/jvm.py,sha256=tzAsIrMSCIeNAtSC8lZWjQS0rq7kjaQDPlePDmvpqDw,9593 +pyarrow/lib.cpython-312-x86_64-linux-gnu.so,sha256=rFmfK84y_4cgB1AVpEJ6i_ccRvpFZdnxKz57L-amYNE,4841880 +pyarrow/lib.h,sha256=UNSuhntc2NTo9y8txHS8MqB10IQN41UuXjb5dGtstfw,4631 +pyarrow/lib.pxd,sha256=repMfzMLwO9NOjVyJbVn5R_vBQJJhD507YcD1wvaB8g,17964 +pyarrow/lib.pyx,sha256=Pe9ERxojd9KzxzqWJ60B8OJHH8Z1fFYg3bUx8ZDFUtk,6016 +pyarrow/lib_api.h,sha256=SCXALS0e94-_uXt9ZlqlUlvU-cclpx7xT8LpxAU1nbM,19487 
+pyarrow/libarrow.so.1800,sha256=Y7RK32bZUhc4QywHUa2EQ5SoVQykN0K1Du5bQ6XPSm4,63802656 +pyarrow/libarrow_acero.so.1800,sha256=BW4UgV6qjSyN0wgsLcaL3JbrmThjkkGrCBmPSqWA3_Q,2076808 +pyarrow/libarrow_dataset.so.1800,sha256=mtdI7IoiqH7b-Vw0Slwf7nzW-XqHvPJEcbBW4nBnuzE,2758368 +pyarrow/libarrow_flight.so.1800,sha256=DGZSDNI0LHv4KSvo-jZq7v4Mr16frKl7OwxORT367VE,20272608 +pyarrow/libarrow_python.so,sha256=cVn2eq555bGR1cCc8JG0C1sdxJ7wI_qMOoxP5X3ptVs,2876328 +pyarrow/libarrow_python_flight.so,sha256=Xpfx0Dh16mnGh3pVPyrP2np39AuHr6184HXoXcIqiT0,117984 +pyarrow/libarrow_python_parquet_encryption.so,sha256=GLMFpnC32YeCz_SBA9wvaIbjRbp5gFf8fPOwxZ8sKdk,41648 +pyarrow/libarrow_substrait.so.1800,sha256=KpWIu7btYy7JPkJoQl93zXwmoUjWqDv6ssnjfuPeCnM,5338320 +pyarrow/libparquet.so.1800,sha256=BooCSAI83odo6uJzxfVBz7tPjokwOdyOspYGNNu3gBw,11040840 +pyarrow/memory.pxi,sha256=9AVMENxqaV0Ndf9tYSiakunEpMRRCZNT9d-PnrY8r14,8229 +pyarrow/orc.py,sha256=IjjeGAEZl0KhHvwy3YsSGfTWlx7Ilb54P0tFKPvwcfk,12618 +pyarrow/pandas-shim.pxi,sha256=d3Z0mki6n3QUTzCOJoEhvgUBcCIcWPsuBli65ZQ_gBg,8178 +pyarrow/pandas_compat.py,sha256=sMLsO2ufQeRxpZadNHv4AEG2FGP8EstyOglL38sqAeA,42775 +pyarrow/parquet/__init__.py,sha256=4W64CbvwvO60tG58nfNtyCwMVCfuPumtu82p-kiGPaE,822 +pyarrow/parquet/__pycache__/__init__.cpython-312.pyc,, +pyarrow/parquet/__pycache__/core.cpython-312.pyc,, +pyarrow/parquet/__pycache__/encryption.cpython-312.pyc,, +pyarrow/parquet/core.py,sha256=SA1zMIm-0cnTPMCjgWe_Bu6bFbjBbTWBpfYauGcHpW8,90440 +pyarrow/parquet/encryption.py,sha256=-XW7Qcbl-jQhpZsR610uQ8-z9ZVE_NL045Jdnp1TZ9M,1153 +pyarrow/public-api.pxi,sha256=EO0_0FZz0JK9_SfuHBPN0ljwwAU7Gv6jGl1WG_BSGsE,13781 +pyarrow/scalar.pxi,sha256=hRcUS1nHQILBp8eL3vfhRXp4yXrvVRPBBoD8ALVdhZ8,35388 +pyarrow/src/arrow/python/CMakeLists.txt,sha256=D4Ypror_508aAd_juYkrS9Qu2maeirK4QXzwGEZEj0M,855 +pyarrow/src/arrow/python/api.h,sha256=W76VAxYqOxi9BHJddji1B62CmaWDFuBhqI65YOhUnGQ,1222 +pyarrow/src/arrow/python/arrow_to_pandas.cc,sha256=z22z8UmNl69KGbmbZLwgZhApNyD9x7xolCSC_3_g6oE,95737 +pyarrow/src/arrow/python/arrow_to_pandas.h,sha256=jUBEUMKXw70oJdMlgkSf6HitaNweQcc7hxI75_C9WSI,5561 +pyarrow/src/arrow/python/arrow_to_python_internal.h,sha256=nQXPZTL3xa4Sm-a-Gv-8bpFs-qAOZHkqWmA_m-dSLVw,1740 +pyarrow/src/arrow/python/async.h,sha256=C0f8YYmgwBGgDau4xEFsdjukiZB4YvpylETHEZryHOo,2352 +pyarrow/src/arrow/python/benchmark.cc,sha256=z6qYRx4qMuNXPaC8fuPJlQd92aosMN85u1aD50R1-UU,1293 +pyarrow/src/arrow/python/benchmark.h,sha256=f-kzyMOlPKDse2bcLWhyMrDEMZrG_JHAPpDJgGW0bXU,1192 +pyarrow/src/arrow/python/common.cc,sha256=_9ozIRo_WTDWovBKqOVyX28d0IttHvwW9MG-PkTzmKc,7591 +pyarrow/src/arrow/python/common.h,sha256=yjljfJK1f7slZ7DBQ4LTo_pob70zioswJNWazy0p-uM,14412 +pyarrow/src/arrow/python/csv.cc,sha256=ql5AY76AqiFksWsrmzSl551k5s9vS8YcmypM2A9rhw8,1803 +pyarrow/src/arrow/python/csv.h,sha256=QxU3B-Hv_RsoEcMGS9-1434ugouL2ygC64Lq6FgviNM,1397 +pyarrow/src/arrow/python/datetime.cc,sha256=_VKRKeyFqR7Xzay2wazcveb7mgOv8K37ebMomNY__lQ,23001 +pyarrow/src/arrow/python/datetime.h,sha256=Bny_THGi2tyUeHxcOuw01O7hNE8B_gave5ABAZQtwTQ,7931 +pyarrow/src/arrow/python/decimal.cc,sha256=66Hy-u-_fcZtm_0v7npDtPNoiX-mkRJTwCj3FpSyIqc,8848 +pyarrow/src/arrow/python/decimal.h,sha256=kDDjLzW07D7d7omWSR4CBF1Ocskp4YSZu4Dtxu-gRUg,4726 +pyarrow/src/arrow/python/deserialize.cc,sha256=ogtBX7OzGuDvyj_LepFkaG7m53-wenf3duG0WF8Ooa0,19185 +pyarrow/src/arrow/python/deserialize.h,sha256=Q4L1qPCra8-Wzl6oLm44cPOUMVuK1FX01LeGzwNUtK4,4260 +pyarrow/src/arrow/python/extension_type.cc,sha256=eU5P7pufWjcEcmVeOyu1jtEZ08AWd9tkTSMfx8ph0rQ,6860 
+pyarrow/src/arrow/python/extension_type.h,sha256=0gzb42y_mbw4fsYs3u8cwPFLBRlG-kkHQLgbvGtrY0U,3181 +pyarrow/src/arrow/python/filesystem.cc,sha256=0twavI91TE20Otq5kkVUwnN5sindU_mBWoVAvz1ZMgI,6152 +pyarrow/src/arrow/python/filesystem.h,sha256=FG0AcLekqaDf9IQPqKixAfIcY_ZLgIKP5NvvXdtBVUM,5126 +pyarrow/src/arrow/python/flight.cc,sha256=Iz4wAyhX7mksabELtRljCOsXRRzuYzu38Rv_yQKJarw,13995 +pyarrow/src/arrow/python/flight.h,sha256=u5UnulNJqMuXQLlODUWuoyxq-GtL1HuHmVGNzobUVGc,14311 +pyarrow/src/arrow/python/gdb.cc,sha256=Z0WLBYHWBzc4uExNG7nWJeRnUBAVSqo_DFpKYry0aAE,22667 +pyarrow/src/arrow/python/gdb.h,sha256=H-qvM-nU8a_3Z5tk8PvppTwQtBMSZhQKQIVgRAsRfFg,972 +pyarrow/src/arrow/python/helpers.cc,sha256=zrrUI56RGrZ8VBzR2dJFJoRq7L6chlX7289HK7tjoOA,16627 +pyarrow/src/arrow/python/helpers.h,sha256=jVNFEbvJXmCceJti3J3-MnZkNlJoynQNq334tt29bbs,5489 +pyarrow/src/arrow/python/inference.cc,sha256=Gm-lOXDzqcbef6gdgCQa5eXPuh8uvYqz9iUjKS2_yO4,24350 +pyarrow/src/arrow/python/inference.h,sha256=FUFvB4Zy7V-tueXdmbDcqTeLK4xj5GZEeRW5yhiJlsU,2038 +pyarrow/src/arrow/python/io.cc,sha256=ZARQCv4WQmHDQrA1dlNZt6mJuPhyK8wNuGm7zoL6V78,11936 +pyarrow/src/arrow/python/io.h,sha256=4jGnodpSUlnVqAVh9fWId7H4WldlLPkXyroABpdaW6w,3858 +pyarrow/src/arrow/python/ipc.cc,sha256=3D9iMbOFHlhNXX4432wsfbfjWvDryZWgdA0Ak19V_8Q,4472 +pyarrow/src/arrow/python/ipc.h,sha256=SZbw6jCCqLiLNCY3k632GmwHeD_r_xrDS0dhqV49VhY,2259 +pyarrow/src/arrow/python/iterators.h,sha256=Ugfm3JvetAH0l-oAjjpZfhrUBqRimVMaw4-xusvqLSg,7327 +pyarrow/src/arrow/python/numpy_convert.cc,sha256=166BIW7zVTRMKogxLUuhV4e5jOevmonvRtXDydNujgc,21194 +pyarrow/src/arrow/python/numpy_convert.h,sha256=y13eHwfe1lJKzadoTr2-GyX6xPsE6Z7FN31s7PN-2Rk,4870 +pyarrow/src/arrow/python/numpy_init.cc,sha256=cJKOH946T7VCcB-gVIoGgfbWTrbj3FPkI4TgnsLTf7s,1178 +pyarrow/src/arrow/python/numpy_init.h,sha256=FniVHP7W2YBlenoMYhQrODvoqqvDMSls2JANGtNPQts,999 +pyarrow/src/arrow/python/numpy_internal.h,sha256=F9p-hzTKCIhRqgtZbsoyPox7RR85YcEw6FYkFF8KqfM,5314 +pyarrow/src/arrow/python/numpy_interop.h,sha256=rI6ek8JTOYtjo7gEADSDBS6QuAOHa2A0YQPZ2GeypFw,3418 +pyarrow/src/arrow/python/numpy_to_arrow.cc,sha256=55-VSQlg10MAZTR0G3I7maErexO8-FDk_27SYdvVlk8,30238 +pyarrow/src/arrow/python/numpy_to_arrow.h,sha256=z9KapsuoOSpWILPt9bea7GR4BL6AQ28T6DUO0mSkh3k,2760 +pyarrow/src/arrow/python/parquet_encryption.cc,sha256=RNupwaySaVHKX_iCYOPK0yJWkTUpqbrpbCW2duWJ3kU,3567 +pyarrow/src/arrow/python/parquet_encryption.h,sha256=Mc8tZ8gIfkH0AckNiIOt6hesP_MVKeKhcytT24ZOLdQ,4861 +pyarrow/src/arrow/python/pch.h,sha256=vkbgStQjq820YeHlXBPdzQ-W9LyzJrTGfMBpnMMqahk,1129 +pyarrow/src/arrow/python/platform.h,sha256=XYS5IqiMUejxN2COzu60Zs8b_wAaGTBw4M-zKVqqs5U,1422 +pyarrow/src/arrow/python/pyarrow.cc,sha256=Pul4lmF7n5Q9cSzgBSvPArWfZY_qDyAq1a_tyMIQGRA,3677 +pyarrow/src/arrow/python/pyarrow.h,sha256=TK3BtD9n3QKOQ9dX3LXbQc0hu9alWcufV0O93iQW7B0,2761 +pyarrow/src/arrow/python/pyarrow_api.h,sha256=7l0G4-_m9yALYoifsY8Z6qh3HHD0PgkpVSgCn_JaGU4,867 +pyarrow/src/arrow/python/pyarrow_lib.h,sha256=-70_Ckj3_0ImlzaXSJOE_d3w9pGM66lXiGPyln9c96Y,863 +pyarrow/src/arrow/python/python_test.cc,sha256=Jg35rRR7BtXOS1012RFOdLViFlVC3zlXV--w8aEzf8I,32397 +pyarrow/src/arrow/python/python_test.h,sha256=ea32mM20uHySlygi9MtVxr26O-ydTZHCUQIlxaIMjT4,1195 +pyarrow/src/arrow/python/python_to_arrow.cc,sha256=K6tVQK1phrrJQzz_TJVmEdfcX-fJfBAkPIeQlypRirY,47145 +pyarrow/src/arrow/python/python_to_arrow.h,sha256=BoVytf6P7PBYXyznchElKZSFvEsFyimB-tLFdw0AUNo,2521 +pyarrow/src/arrow/python/serialize.cc,sha256=FOAsdyfRETe_bCSxC1vc3-oq9Rs9SsU4kDQFTwrdvQM,32667 
+pyarrow/src/arrow/python/serialize.h,sha256=HVBhIKgc7A4YOmwYfjE2Hqj1Yxl9suCJb940KxrVcrs,4630 +pyarrow/src/arrow/python/type_traits.h,sha256=B_NsRT_hZG8D91sTcihJyKF5SrslPcFmj12QfbpHuLI,10093 +pyarrow/src/arrow/python/udf.cc,sha256=69DuHRjV6rUAbZqkWEKEUG3ODuHg9ym52lnH7A_lM5Y,29814 +pyarrow/src/arrow/python/udf.h,sha256=de3R8PhNJO5lT9oCqRxe8e2_SE3jBpHOkwbNqCrlgjQ,3104 +pyarrow/src/arrow/python/vendored/CMakeLists.txt,sha256=02XvDJAdKiajCEBOmnMKBpmzbRU7FPkNdlNXtw0-A24,837 +pyarrow/src/arrow/python/vendored/pythoncapi_compat.h,sha256=bzMnlHTCfjk5DQRIxwytunYh5aQxU3iSElaaDyNnAY8,40900 +pyarrow/src/arrow/python/visibility.h,sha256=hwJw5sGrWJckQkNaAuLe4Tf-VDjQbXknyzNOVgZI3FI,1381 +pyarrow/substrait.py,sha256=ugd_UrjkUIrwSvqFxLl9WkVtBZ2-hcgt5XiSVYvDLnQ,1151 +pyarrow/table.pxi,sha256=Dfujf9nDQ9R--F5cybcUxB126Hu1mBuARWvgOTuFl3o,203868 +pyarrow/tensor.pxi,sha256=CXlMcTRWh_n_FTzIIx8SpHCmYlV0IBA69toQ-3Evs5o,42071 +pyarrow/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pyarrow/tests/__pycache__/__init__.cpython-312.pyc,, +pyarrow/tests/__pycache__/arrow_16597.cpython-312.pyc,, +pyarrow/tests/__pycache__/arrow_39313.cpython-312.pyc,, +pyarrow/tests/__pycache__/arrow_7980.cpython-312.pyc,, +pyarrow/tests/__pycache__/conftest.cpython-312.pyc,, +pyarrow/tests/__pycache__/pandas_examples.cpython-312.pyc,, +pyarrow/tests/__pycache__/pandas_threaded_import.cpython-312.pyc,, +pyarrow/tests/__pycache__/read_record_batch.cpython-312.pyc,, +pyarrow/tests/__pycache__/strategies.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_acero.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_adhoc_memory_leak.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_array.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_builder.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_cffi.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_compute.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_convert_builtin.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_cpp_internals.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_csv.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_cuda.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_cuda_numba_interop.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_cython.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_dataset.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_dataset_encryption.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_deprecations.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_device.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_dlpack.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_exec_plan.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_extension_type.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_feather.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_flight.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_flight_async.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_fs.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_gandiva.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_gdb.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_io.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_ipc.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_json.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_jvm.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_memory.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_misc.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_orc.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_pandas.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_scalars.cpython-312.pyc,, 
+pyarrow/tests/__pycache__/test_schema.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_sparse_tensor.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_strategies.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_substrait.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_table.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_tensor.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_types.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_udf.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_util.cpython-312.pyc,, +pyarrow/tests/__pycache__/test_without_numpy.cpython-312.pyc,, +pyarrow/tests/__pycache__/util.cpython-312.pyc,, +pyarrow/tests/__pycache__/wsgi_examples.cpython-312.pyc,, +pyarrow/tests/arrow_16597.py,sha256=DNb41h9E3ITGvAJJu86i5SfsKrwstQJ0E5gT_bpTS_k,1354 +pyarrow/tests/arrow_39313.py,sha256=0pyBixoX38fldTPO1Vwshi_H0XBACrz8esYoL4o71KI,1431 +pyarrow/tests/arrow_7980.py,sha256=tZKb_tRLfxHaosDk9Yu2GLEsJjMaruXD5CKhbK_6Hq8,1094 +pyarrow/tests/bound_function_visit_strings.pyx,sha256=vDEFoNYR8BWNkCntKDuBUT8sXNRBex_5G2bFKogr1Bs,2026 +pyarrow/tests/conftest.py,sha256=PwqCO9vIgMUc2W9gCwcDvEz4hcp2eIYHDZ_fwddhqJ4,9904 +pyarrow/tests/data/feather/v0.17.0.version.2-compression.lz4.feather,sha256=qzcc7Bo4OWBXYsyyKdDJwdTRstMqB1Zz0GiGYtndBnE,594 +pyarrow/tests/data/orc/README.md,sha256=_4X5XszZqQtWAVEz5N1Va4VyyayGQgNDKrcdMX2Ib4s,932 +pyarrow/tests/data/orc/TestOrcFile.emptyFile.jsn.gz,sha256=xLjAXd-3scx3DCyeAsmxTO3dv1cj9KRvYopKe5rQNiI,50 +pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc,sha256=zj0579dQBXhF7JuB-ZphkmQ81ybLo6Ca4zPV4HXoImY,523 +pyarrow/tests/data/orc/TestOrcFile.test1.jsn.gz,sha256=kLxmwMVHtfzpHqBztFjfY_PTCloaXpfHq9DDDszb8Wk,323 +pyarrow/tests/data/orc/TestOrcFile.test1.orc,sha256=A4JxgMCffTkz9-XT1QT1tg2TlYZRRz1g7iIMmqzovqA,1711 +pyarrow/tests/data/orc/TestOrcFile.testDate1900.jsn.gz,sha256=oWf7eBR3ZtOA91OTvdeQJYos1an56msGsJwhGOan3lo,182453 +pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc,sha256=nYsVYhUGGOL80gHj37si_vX0dh8QhIMSeU4sHjNideM,30941 +pyarrow/tests/data/orc/decimal.jsn.gz,sha256=kTEyYdPDAASFUX8Niyry5mRDF-Y-LsrhSAjbu453mvA,19313 +pyarrow/tests/data/orc/decimal.orc,sha256=W5cV2WdLy4OrSTnd_Qv5ntphG4TcB-MyG4UpRFwSxJY,16337 +pyarrow/tests/data/parquet/v0.7.1.all-named-index.parquet,sha256=YPGUXtw-TsOPbiNDieZHobNp3or7nHhAxJGjmIDAyqE,3948 +pyarrow/tests/data/parquet/v0.7.1.column-metadata-handling.parquet,sha256=7sebZgpfdcP37QksT3FhDL6vOA9gR6GBaq44NCVtOYw,2012 +pyarrow/tests/data/parquet/v0.7.1.parquet,sha256=vmdzhIzpBbmRkq3Gjww7KqurfSFNtQuSpSIDeQVmqys,4372 +pyarrow/tests/data/parquet/v0.7.1.some-named-index.parquet,sha256=VGgSjqihCRtdBxlUcfP5s3BSR7aUQKukW-bGgJLf_HY,4008 +pyarrow/tests/extensions.pyx,sha256=05S652zNGxwMFwuyMbP0RP4VNJLSMlzvoxH8iYIvSNk,3054 +pyarrow/tests/interchange/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785 +pyarrow/tests/interchange/__pycache__/__init__.cpython-312.pyc,, +pyarrow/tests/interchange/__pycache__/test_conversion.cpython-312.pyc,, +pyarrow/tests/interchange/__pycache__/test_interchange_spec.cpython-312.pyc,, +pyarrow/tests/interchange/test_conversion.py,sha256=23e5tpKBL-ekA5uWpM6-f6HVPF937Hnzfune0Ty9moo,18609 +pyarrow/tests/interchange/test_interchange_spec.py,sha256=5hwwCG6f7yf72PfUG0iLIk2bARsZU5EJeRjDxSQrkKI,9320 +pyarrow/tests/pandas_examples.py,sha256=RFKCW-Rn0Qz-ncd4pZWWSeUoPq63kemE3lFiVdv2dBs,5115 +pyarrow/tests/pandas_threaded_import.py,sha256=b_ubLr5dj4dWJht9552qc3S3Yt3fQQgaUH6208oZvHg,1429 +pyarrow/tests/parquet/__init__.py,sha256=dKsXU9M-sJyz2wYIuqwsKM9meOlK_qY6qhmQzIvEpCE,931 
+pyarrow/tests/parquet/__pycache__/__init__.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/common.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/conftest.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/encryption.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_basic.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_compliant_nested_type.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_data_types.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_dataset.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_datetime.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_encryption.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_metadata.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_pandas.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_parquet_file.cpython-312.pyc,, +pyarrow/tests/parquet/__pycache__/test_parquet_writer.cpython-312.pyc,, +pyarrow/tests/parquet/common.py,sha256=-kckaOVj9P9BvL1vlvyHlsPUtysBoAYVL98Nwc9wmGo,5894 +pyarrow/tests/parquet/conftest.py,sha256=mJNQal0VYsGFhHglhSt-F9CYHy_i8hB8MXaq3SxFBvk,3082 +pyarrow/tests/parquet/encryption.py,sha256=Oi3QbixApvWGoGImiW7PAjR28cTQqlRXZKMI3O7E4UY,2521 +pyarrow/tests/parquet/test_basic.py,sha256=SdLPuZ02NaBUEtpr18SzA90I7pK-WRMM1r3ApbQI5ps,36492 +pyarrow/tests/parquet/test_compliant_nested_type.py,sha256=Lz7tCPrSpv9GrKPMS-eu1LehsCTwz7KdUdCYJ8tF8dE,3901 +pyarrow/tests/parquet/test_data_types.py,sha256=tdYodveHBksDjM7DjSc7x1IEqMAZv0y6z2GsnpDdriM,15778 +pyarrow/tests/parquet/test_dataset.py,sha256=UhjjQGO2kki9Q50zush0VGU3OMXHZncL_3uEQse4Lx8,42218 +pyarrow/tests/parquet/test_datetime.py,sha256=A3ZaRj88u0IrlhCNp2KY_A8txrb7y2pKPgEVvI7e7bU,16398 +pyarrow/tests/parquet/test_encryption.py,sha256=XMVlIcEurlzcPN2rlaNqbdZbGhF9hjz5ZhWY5Bz4Fxo,22099 +pyarrow/tests/parquet/test_metadata.py,sha256=0sbEUEEal4dEczJHk77KzCk3q9P_1JD61Ayw6HBXFzo,27158 +pyarrow/tests/parquet/test_pandas.py,sha256=dXXcaRBZXIt2HervJLC1gCxDLlhxu6MM_M3gxcaV1Rw,22821 +pyarrow/tests/parquet/test_parquet_file.py,sha256=xm5ZUCf5xmpKh7s5nTIrEiis53mfv2NqZWVRiYOTfAg,9909 +pyarrow/tests/parquet/test_parquet_writer.py,sha256=xwedRwRYtw5n_OMhLPGnJurcvlo4ooROsSalYL-ZVCM,11733 +pyarrow/tests/pyarrow_cython_example.pyx,sha256=fx6zT1bUb2-cDnwKoG71K3ozpmrNJ53kKQHHJTExGz8,2115 +pyarrow/tests/read_record_batch.py,sha256=9Y0X0h03hUXwOKZz7jBBZSwgIrjxT-FkWIw6pu38Frc,953 +pyarrow/tests/strategies.py,sha256=ygkKPSV8CM8IMU8uW8d_RuDZEbwyj8bhD0Bv-ZwvaRk,13926 +pyarrow/tests/test_acero.py,sha256=jgSkIAGhrVffShaD0ZAm50sY-f4u9jfjCimK8ezUbbA,15003 +pyarrow/tests/test_adhoc_memory_leak.py,sha256=Pn4PcIbOBRtSJuz9Ar_ocubco0QOMZ-eAE9Bs7Wp4mA,1453 +pyarrow/tests/test_array.py,sha256=p3JPYOvP6zJgNI2vuQ_ah9p5w126d9HRFeHN6Z5q894,139832 +pyarrow/tests/test_builder.py,sha256=zNEcslLwyb40oYbG7lInQcI81QHMKDOzi1zthw1Je7c,2803 +pyarrow/tests/test_cffi.py,sha256=Fbs1dFCxdnvXYLgO5oaxm_h8KV3vefE9jc3nI1JZNxw,26385 +pyarrow/tests/test_compute.py,sha256=ajHKKGCpw92ZgdJl2pfdVF1UW4xGQB3EPELxXt-CnNw,142525 +pyarrow/tests/test_convert_builtin.py,sha256=QTTX4KcmfZ5keLpSjfPnft9Eim4FeYnBpvPDwnOMGP0,80894 +pyarrow/tests/test_cpp_internals.py,sha256=Xg4CUB6zohQkcYG64Lj_Uf2BscI27Vv0JC_CqNkDQuE,2006 +pyarrow/tests/test_csv.py,sha256=GKNYAGis5TsiDJMIu0L6bH2_cIOpWDviRwxCfPN9Pv8,77313 +pyarrow/tests/test_cuda.py,sha256=qCbVbYOokzpEef-e0_Fus36xQR9Y---9MLCYquI3shE,36163 +pyarrow/tests/test_cuda_numba_interop.py,sha256=iHP_FE4sWbsKwNNXRcYnVozp3Wd1o0Mg6BDymx710G4,8794 +pyarrow/tests/test_cython.py,sha256=IJVELKXBD89xoCcxscMfUpwvkk9SL_kNT4cccLjDww4,7115 
+pyarrow/tests/test_dataset.py,sha256=nbTfPH338ZqDstL1FuYpD7HefNMvbDi_zPF_zd4lFew,210420 +pyarrow/tests/test_dataset_encryption.py,sha256=mA8ipIlOBSA4eKc6xnRz-IFyM7fu_kIQ5FV2r4vE2rs,7593 +pyarrow/tests/test_deprecations.py,sha256=W_rneq4jC6zqCNoGhBDf1F28Q-0LHI7YKLgtsbV6LHM,891 +pyarrow/tests/test_device.py,sha256=qe9Wiwo-XVazt9pdxyqQJUz6fNR0jTs9CHiyaoppNA4,2550 +pyarrow/tests/test_dlpack.py,sha256=3s23cDst8TaUdum_v4XrWBJ9Ny5q4-b20vJJlHJLI8o,4937 +pyarrow/tests/test_exec_plan.py,sha256=pjOkSaWeqjN6celKxUEH3tBGXLh8kKbmSSsvKOWsbQQ,10096 +pyarrow/tests/test_extension_type.py,sha256=gKukBp0ial_3-iBeLsLIJKN-4ayn1K7P7auil2luH1U,65617 +pyarrow/tests/test_feather.py,sha256=Rw8J4upIZhR0GMe17n84IFGItlBUk9qpHOCWmDWyCuw,25074 +pyarrow/tests/test_flight.py,sha256=9kJlmuwCSzKrilP3UMeA20cyZJwlRB_pqGavbRM0Y7E,87152 +pyarrow/tests/test_flight_async.py,sha256=g_mNqrnNBp7GWNOWZgnVklZcVKV_vvAAChDgcQICNdo,2873 +pyarrow/tests/test_fs.py,sha256=n-RuiqvfK9zWkmmuhHLSZp3v5pRR1f258YKB6R5DsdI,65418 +pyarrow/tests/test_gandiva.py,sha256=AEf9ln-j5MmIMQ0JTQPhnZwbNh82ynSURsWPaKaNing,15623 +pyarrow/tests/test_gdb.py,sha256=OJzMfZtev3YOKJBm2QxnE-q-9-exy2JLhxpiVhY3T_0,44938 +pyarrow/tests/test_io.py,sha256=T9Vdg1rPGjdAp7nd5U9TAc3mN0N4oWvlG-F8TKmMVS4,63768 +pyarrow/tests/test_ipc.py,sha256=JPW2Q3pXKi8Y4adbCkpGZeNjdP8C6Ot1TqapinKeO_Q,42746 +pyarrow/tests/test_json.py,sha256=P60OhNO7MqHWmppL7cKPmvFEYNMj0XdztxiNGxvjhkM,13169 +pyarrow/tests/test_jvm.py,sha256=pIrHUgnDdmwDoLgM2TFvdgfcEJTGtBGsPgzYIRU6jYY,15507 +pyarrow/tests/test_memory.py,sha256=FqCTUSCqZvKx4k-JDY3M83MvxQ15iNdMUbyUxACfS7w,8874 +pyarrow/tests/test_misc.py,sha256=5-P4nWTZXB7ObuCiVwsQgCjNJ8883tZh03EY4aWea4I,7227 +pyarrow/tests/test_orc.py,sha256=oijYMqsxPLYbpEy1NTwqlz-wiTd8aKttaZH6npXNXoY,19321 +pyarrow/tests/test_pandas.py,sha256=_X9K5EQuAMff5vjj0CPlw-Yoa2syFbjXAfWOoZKPPIA,188352 +pyarrow/tests/test_scalars.py,sha256=cKsl6QSB68aKTcHRI_sVXXonA-OgIOrkGjW3iAEIDT4,27654 +pyarrow/tests/test_schema.py,sha256=3ed2GtcKqio7XJMbl9HTN2XxqCLlhiVJBze7VIwxn8Q,21814 +pyarrow/tests/test_sparse_tensor.py,sha256=6Hp-X6PLqcUzTZCRSB-TyaAWR7ZyWf5YsWuZgixmd64,17500 +pyarrow/tests/test_strategies.py,sha256=HEL8T94h8VyukGKNRVAW_RyQ3m36REc2P4q2BQZ_PfY,1811 +pyarrow/tests/test_substrait.py,sha256=yGijuSlKRUndT80QMvuqfCx4135uAI7UjN89RYYiFCI,30634 +pyarrow/tests/test_table.py,sha256=WAYwyPK8jiGWd6H8BKjJsQGcFAsgT0zW-vO5d-7iyo8,120500 +pyarrow/tests/test_tensor.py,sha256=LYSEYGUjtdnsbL0WAir9jFindo-r0bLySiDA1uAXL8E,6643 +pyarrow/tests/test_types.py,sha256=KH-BLjbSuQ17ySb0qr8ZUsYiHNfy_fGHuTsA-Ypr4Og,42056 +pyarrow/tests/test_udf.py,sha256=WA9E5skUqh7uMr_zH3rQ11LRx0SK2G3WO8HjVHGWyQY,29792 +pyarrow/tests/test_util.py,sha256=ozTlooHBMOP3nbX5b3dG2aanrXwxXHx1giicm0QQyPM,5030 +pyarrow/tests/test_without_numpy.py,sha256=ysbB-jML318I04ViQT4Ok7iMg1cI-NU8kguPu-FTSl4,1855 +pyarrow/tests/util.py,sha256=YeH8RovBtKY4L1SJqcOOObEZx0Yf6HSpkkq4xJdKL5U,13275 +pyarrow/tests/wsgi_examples.py,sha256=vQIDb5989sRVLsELw-fRHhfX-dE96sTl5J2lEuEKup8,1348 +pyarrow/types.pxi,sha256=dPCKGp91crrmtwOfkotcsh0QNVPrmOdQQVqOuaHbCao,157764 +pyarrow/types.py,sha256=Woixb8A_OzBNtolWwwFGhbEWn10gavaB7S0wGMoFakQ,7240 +pyarrow/util.py,sha256=W0LXUR7nsrA5N-l3THD283bxCibS0sM1q6WLcfbFFz8,7970 +pyarrow/vendored/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785 +pyarrow/vendored/__pycache__/__init__.cpython-312.pyc,, +pyarrow/vendored/__pycache__/docscrape.cpython-312.pyc,, +pyarrow/vendored/__pycache__/version.cpython-312.pyc,, +pyarrow/vendored/docscrape.py,sha256=phTjwuzoO5hB88QerZk3uGu9c5OrZwjFzI7vEIIbCUQ,22975 
+pyarrow/vendored/version.py,sha256=5-Vo4Q3kPJrm1DSGusnMlTxuA8ynI4hAryApBd6MnpQ,14345 diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/REQUESTED b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/REQUESTED new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/WHEEL b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..cc9e52b5e838511fd027bbbb4a8f8bc137509e88 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: setuptools (72.1.0) +Root-Is-Purelib: false +Tag: cp312-cp312-manylinux_2_28_x86_64 + diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/top_level.txt b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..652a7f20a026b7151711b81e752207ae1bcfce96 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow-18.0.0.dist-info/top_level.txt @@ -0,0 +1,2 @@ +__dummy__ +pyarrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__init__.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__init__.pxd new file mode 100644 index 0000000000000000000000000000000000000000..8cc54b4c6bfdaa0e347b3927d7932934916a1ade --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__init__.pxd @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from libcpp.memory cimport shared_ptr +from pyarrow.includes.libarrow cimport (CArray, CBuffer, CDataType, + CField, CRecordBatch, CSchema, + CTable, CTensor, CSparseCOOTensor, + CSparseCSRMatrix, CSparseCSCMatrix, + CSparseCSFTensor) + +cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py": + cdef int import_pyarrow() except -1 + cdef object wrap_buffer(const shared_ptr[CBuffer]& buffer) + cdef object wrap_data_type(const shared_ptr[CDataType]& type) + cdef object wrap_field(const shared_ptr[CField]& field) + cdef object wrap_schema(const shared_ptr[CSchema]& schema) + cdef object wrap_array(const shared_ptr[CArray]& sp_array) + cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor) + cdef object wrap_sparse_tensor_coo( + const shared_ptr[CSparseCOOTensor]& sp_sparse_tensor) + cdef object wrap_sparse_tensor_csr( + const shared_ptr[CSparseCSRMatrix]& sp_sparse_tensor) + cdef object wrap_sparse_tensor_csc( + const shared_ptr[CSparseCSCMatrix]& sp_sparse_tensor) + cdef object wrap_sparse_tensor_csf( + const shared_ptr[CSparseCSFTensor]& sp_sparse_tensor) + cdef object wrap_table(const shared_ptr[CTable]& ctable) + cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__init__.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d31c93119b73a217cc5695ab688dec86467e8905 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__init__.py @@ -0,0 +1,435 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + +""" +PyArrow is the python implementation of Apache Arrow. + +Apache Arrow is a cross-language development platform for in-memory data. +It specifies a standardized language-independent columnar memory format for +flat and hierarchical data, organized for efficient analytic operations on +modern hardware. It also provides computational libraries and zero-copy +streaming messaging and interprocess communication. + +For more information see the official page at https://arrow.apache.org +""" + +import gc as _gc +import importlib as _importlib +import os as _os +import platform as _platform +import sys as _sys +import warnings as _warnings + +try: + from ._generated_version import version as __version__ +except ImportError: + # Package is not installed, parse git tag at runtime + try: + import setuptools_scm + # Code duplicated from setup.py to avoid a dependency on each other + + def parse_git(root, **kwargs): + """ + Parse function for setuptools_scm that ignores tags for non-C++ + subprojects, e.g. apache-arrow-js-XXX tags. 
+ """ + from setuptools_scm.git import parse + kwargs['describe_command'] = \ + "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'" + return parse(root, **kwargs) + __version__ = setuptools_scm.get_version('../', + parse=parse_git) + except ImportError: + __version__ = None + +# ARROW-8684: Disable GC while initializing Cython extension module, +# to workaround Cython bug in https://github.com/cython/cython/issues/3603 +_gc_enabled = _gc.isenabled() +_gc.disable() +import pyarrow.lib as _lib +if _gc_enabled: + _gc.enable() + +from pyarrow.lib import (BuildInfo, RuntimeInfo, set_timezone_db_path, + MonthDayNano, VersionInfo, cpp_build_info, + cpp_version, cpp_version_info, runtime_info, + cpu_count, set_cpu_count, enable_signal_handlers, + io_thread_count, set_io_thread_count) + + +def show_versions(): + """ + Print various version information, to help with error reporting. + """ + def print_entry(label, value): + print(f"{label: <26}: {value: <8}") + + print("pyarrow version info\n--------------------") + print_entry("Package kind", cpp_build_info.package_kind + if len(cpp_build_info.package_kind) > 0 + else "not indicated") + print_entry("Arrow C++ library version", cpp_build_info.version) + print_entry("Arrow C++ compiler", + f"{cpp_build_info.compiler_id} {cpp_build_info.compiler_version}") + print_entry("Arrow C++ compiler flags", cpp_build_info.compiler_flags) + print_entry("Arrow C++ git revision", cpp_build_info.git_id) + print_entry("Arrow C++ git description", cpp_build_info.git_description) + print_entry("Arrow C++ build type", cpp_build_info.build_type) + + +def _module_is_available(module): + try: + _importlib.import_module(f'pyarrow.{module}') + except ImportError: + return False + else: + return True + + +def _filesystem_is_available(fs): + try: + import pyarrow.fs + except ImportError: + return False + + try: + getattr(pyarrow.fs, fs) + except (ImportError, AttributeError): + return False + else: + return True + + +def show_info(): + """ + Print detailed version and platform information, for error reporting + """ + show_versions() + + def print_entry(label, value): + print(f" {label: <20}: {value: <8}") + + print("\nPlatform:") + print_entry("OS / Arch", f"{_platform.system()} {_platform.machine()}") + print_entry("SIMD Level", runtime_info().simd_level) + print_entry("Detected SIMD Level", runtime_info().detected_simd_level) + + pool = default_memory_pool() + print("\nMemory:") + print_entry("Default backend", pool.backend_name) + print_entry("Bytes allocated", f"{pool.bytes_allocated()} bytes") + print_entry("Max memory", f"{pool.max_memory()} bytes") + print_entry("Supported Backends", ', '.join(supported_memory_backends())) + + print("\nOptional modules:") + modules = ["csv", "cuda", "dataset", "feather", "flight", "fs", "gandiva", "json", + "orc", "parquet"] + for module in modules: + status = "Enabled" if _module_is_available(module) else "-" + print(f" {module: <20}: {status: <8}") + + print("\nFilesystems:") + filesystems = ["AzureFileSystem", "GcsFileSystem", + "HadoopFileSystem", "S3FileSystem"] + for fs in filesystems: + status = "Enabled" if _filesystem_is_available(fs) else "-" + print(f" {fs: <20}: {status: <8}") + + print("\nCompression Codecs:") + codecs = ["brotli", "bz2", "gzip", "lz4_frame", "lz4", "snappy", "zstd"] + for codec in codecs: + status = "Enabled" if Codec.is_available(codec) else "-" + print(f" {codec: <20}: {status: <8}") + + +from pyarrow.lib import (null, bool_, + int8, int16, int32, int64, + uint8, uint16, uint32, uint64, 
+ time32, time64, timestamp, date32, date64, duration, + month_day_nano_interval, + float16, float32, float64, + binary, string, utf8, binary_view, string_view, + large_binary, large_string, large_utf8, + decimal128, decimal256, + list_, large_list, list_view, large_list_view, + map_, struct, + union, sparse_union, dense_union, + dictionary, + run_end_encoded, + bool8, fixed_shape_tensor, opaque, uuid, + field, + type_for_alias, + DataType, DictionaryType, StructType, + ListType, LargeListType, FixedSizeListType, + ListViewType, LargeListViewType, + MapType, UnionType, SparseUnionType, DenseUnionType, + TimestampType, Time32Type, Time64Type, DurationType, + FixedSizeBinaryType, Decimal128Type, Decimal256Type, + BaseExtensionType, ExtensionType, + RunEndEncodedType, Bool8Type, FixedShapeTensorType, + OpaqueType, UuidType, + PyExtensionType, UnknownExtensionType, + register_extension_type, unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, Tensor, + array, chunked_array, record_batch, nulls, repeat, + SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix, + SparseCSFTensor, + infer_type, from_numpy_dtype, + NullArray, + NumericArray, IntegerArray, FloatingPointArray, + BooleanArray, + Int8Array, UInt8Array, + Int16Array, UInt16Array, + Int32Array, UInt32Array, + Int64Array, UInt64Array, + HalfFloatArray, FloatArray, DoubleArray, + ListArray, LargeListArray, FixedSizeListArray, + ListViewArray, LargeListViewArray, + MapArray, UnionArray, + BinaryArray, StringArray, + LargeBinaryArray, LargeStringArray, + BinaryViewArray, StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, Date64Array, TimestampArray, + Time32Array, Time64Array, DurationArray, + MonthDayNanoIntervalArray, + Decimal128Array, Decimal256Array, StructArray, ExtensionArray, + RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, + OpaqueArray, UuidArray, + scalar, NA, _NULL as NULL, Scalar, + NullScalar, BooleanScalar, + Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, + UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, + HalfFloatScalar, FloatScalar, DoubleScalar, + Decimal128Scalar, Decimal256Scalar, + ListScalar, LargeListScalar, FixedSizeListScalar, + ListViewScalar, LargeListViewScalar, + Date32Scalar, Date64Scalar, + Time32Scalar, Time64Scalar, + TimestampScalar, DurationScalar, + MonthDayNanoIntervalScalar, + BinaryScalar, LargeBinaryScalar, BinaryViewScalar, + StringScalar, LargeStringScalar, StringViewScalar, + FixedSizeBinaryScalar, DictionaryScalar, + MapScalar, StructScalar, UnionScalar, + RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, + FixedShapeTensorScalar, OpaqueScalar, UuidScalar) + +# Buffers, allocation +from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, + default_cpu_memory_manager) + +from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, + Codec, compress, decompress, allocate_buffer) + +from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, + total_allocated_bytes, set_memory_pool, + default_memory_pool, system_memory_pool, + jemalloc_memory_pool, mimalloc_memory_pool, + logging_memory_pool, proxy_memory_pool, + log_memory_allocations, jemalloc_set_decay_ms, + supported_memory_backends) + +# I/O +from pyarrow.lib import (NativeFile, PythonFile, + BufferedInputStream, BufferedOutputStream, CacheOptions, + CompressedInputStream, CompressedOutputStream, + TransformInputStream, transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, 
BufferOutputStream, + OSFile, MemoryMappedFile, memory_map, + create_memory_map, MockOutputStream, + input_stream, output_stream, + have_libhdfs) + +from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, + concat_arrays, concat_tables, TableGroupBy, + RecordBatchReader) + +# Exceptions +from pyarrow.lib import (ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc + +import pyarrow.types as types + + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + + +# TODO: Deprecate these somehow in the pyarrow namespace +from pyarrow.ipc import (Message, MessageReader, MetadataVersion, + RecordBatchFileReader, RecordBatchFileWriter, + RecordBatchStreamReader, RecordBatchStreamWriter) + +# ---------------------------------------------------------------------- +# Returning absolute path to the pyarrow include directory (if bundled, e.g. in +# wheels) + + +def get_include(): + """ + Return absolute path to directory containing Arrow C++ include + headers. Similar to numpy.get_include + """ + return _os.path.join(_os.path.dirname(__file__), 'include') + + +def _get_pkg_config_executable(): + return _os.environ.get('PKG_CONFIG', 'pkg-config') + + +def _has_pkg_config(pkgname): + import subprocess + try: + return subprocess.call([_get_pkg_config_executable(), + '--exists', pkgname]) == 0 + except FileNotFoundError: + return False + + +def _read_pkg_config_variable(pkgname, cli_args): + import subprocess + cmd = [_get_pkg_config_executable(), pkgname] + cli_args + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = proc.communicate() + if proc.returncode != 0: + raise RuntimeError("pkg-config failed: " + err.decode('utf8')) + return out.rstrip().decode('utf8') + + +def get_libraries(): + """ + Return list of library names to include in the `libraries` argument for C + or Cython extensions using pyarrow + """ + return ['arrow_python', 'arrow'] + + +def create_library_symlinks(): + """ + With Linux and macOS wheels, the bundled shared libraries have an embedded + ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them + with -larrow won't work unless we create symlinks at locations like + site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses + prior problems we had with shipping two copies of the shared libraries to + permit third party projects like turbodbc to build their C++ extensions + against the pyarrow wheels. + + This function must only be invoked once and only when the shared libraries + are bundled with the Python package, which should only apply to wheel-based + installs. It requires write access to the site-packages/pyarrow directory + and so depending on your system may need to be run with root. 
+ """ + import glob + if _sys.platform == 'win32': + return + package_cwd = _os.path.dirname(__file__) + + if _sys.platform == 'linux': + bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*')) + + def get_symlink_path(hard_path): + return hard_path.rsplit('.', 1)[0] + else: + bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib')) + + def get_symlink_path(hard_path): + return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib')) + + for lib_hard_path in bundled_libs: + symlink_path = get_symlink_path(lib_hard_path) + if _os.path.exists(symlink_path): + continue + try: + _os.symlink(lib_hard_path, symlink_path) + except PermissionError: + print("Tried creating symlink {}. If you need to link to " + "bundled shared libraries, run " + "pyarrow.create_library_symlinks() as root") + + +def get_library_dirs(): + """ + Return lists of directories likely to contain Arrow C++ libraries for + linking C or Cython extensions using pyarrow + """ + package_cwd = _os.path.dirname(__file__) + library_dirs = [package_cwd] + + def append_library_dir(library_dir): + if library_dir not in library_dirs: + library_dirs.append(library_dir) + + # Search library paths via pkg-config. This is necessary if the user + # installed libarrow and the other shared libraries manually and they + # are not shipped inside the pyarrow package (see also ARROW-2976). + pkg_config_executable = _os.environ.get('PKG_CONFIG') or 'pkg-config' + for pkgname in ["arrow", "arrow_python"]: + if _has_pkg_config(pkgname): + library_dir = _read_pkg_config_variable(pkgname, + ["--libs-only-L"]) + # pkg-config output could be empty if Arrow is installed + # as a system package. + if library_dir: + if not library_dir.startswith("-L"): + raise ValueError( + "pkg-config --libs-only-L returned unexpected " + "value {!r}".format(library_dir)) + append_library_dir(library_dir[2:]) + + if _sys.platform == 'win32': + # TODO(wesm): Is this necessary, or does setuptools within a conda + # installation add Library\lib to the linker path for MSVC? + python_base_install = _os.path.dirname(_sys.executable) + library_dir = _os.path.join(python_base_install, 'Library', 'lib') + + if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')): + append_library_dir(library_dir) + + # ARROW-4074: Allow for ARROW_HOME to be set to some other directory + if _os.environ.get('ARROW_HOME'): + append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib')) + else: + # Python wheels bundle the Arrow libraries in the pyarrow directory. 
+ append_library_dir(_os.path.dirname(_os.path.abspath(__file__))) + + return library_dirs diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/__init__.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4be8aa76e30ca89fcf1b319ee1f86da9b1a400ed Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/__init__.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/_compute_docstrings.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/_compute_docstrings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba16c0d54b5c66892ee374eeba125b24c14cf087 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/_compute_docstrings.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/_generated_version.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/_generated_version.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..900d4a9105dd1dcdad5d9da762e921656463a2d7 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/_generated_version.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/acero.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/acero.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7fc65c864da7c2275a1215722a220143b0209705 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/acero.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/benchmark.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/benchmark.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f6c91bd6a773f0c456559e2f5a72882b20639b7 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/benchmark.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/cffi.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/cffi.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2afe8d23cf2668bbad2efb4a50f8182033aabc9 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/cffi.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/compute.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/compute.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5c71914b82b1565a26d310e87a92b5cddf27867 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/compute.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/conftest.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0372f472ce0eb5e2c7bf194433b83f676f59da6b Binary files /dev/null and 
b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/conftest.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/csv.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/csv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99528d8e48a9656a1c7aebba15ba5d82b73ceeaa Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/csv.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/cuda.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/cuda.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca69b12313c9a15c6cd0830320e714c1c84ae234 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/cuda.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/dataset.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/dataset.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e56f715d139f5b7ee892b9dba08a4b6d410db3f Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/dataset.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/feather.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/feather.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80dd3cd4cf34929990b7a44e279a8e3d19c19798 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/feather.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/flight.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/flight.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8eb1b2a6a80f36280c6cb6634904261d09f69f95 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/flight.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/fs.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/fs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ae81bf590f0b2cfbd033a798634be21efc60de1 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/fs.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/ipc.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/ipc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9267898ebb4b124d406927d07075e0b0f80f5ade Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/ipc.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/json.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/json.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3931c877cb8132242e24012f9353d18c58fb03b2 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/json.cpython-312.pyc differ diff --git 
a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/jvm.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/jvm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d9e9338009efc6639c5daaa129c9212ca0be37f Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/jvm.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/orc.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/orc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e18f0ff901e9d69feb5b07706e5e2440f03d4d76 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/orc.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/pandas_compat.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/pandas_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..557efd93198c725c98250930a7ed6aa9eb741cbd Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/pandas_compat.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/substrait.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/substrait.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0b9eff4ddade1de2f09a4b56eadbad0db04eae9 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/substrait.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/types.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8404839f6561ed3f45fde35e61b326ecdc00837e Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/types.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/util.cpython-312.pyc b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/util.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aba50e7178608f2b64c1b752f5fdf02d5124e6d0 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/__pycache__/util.cpython-312.pyc differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..90541d268800dcb64eba9f5706f3e691bb687a6c Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.pxd new file mode 100644 index 0000000000000000000000000000000000000000..4553aee9d6f16c391340aa45489471bdcfe0cb76 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.pxd @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from pyarrow.lib cimport * +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_acero cimport * + + +cdef class ExecNodeOptions(_Weakrefable): + cdef: + shared_ptr[CExecNodeOptions] wrapped + + cdef void init(self, const shared_ptr[CExecNodeOptions]& sp) + cdef inline shared_ptr[CExecNodeOptions] unwrap(self) nogil + + +cdef class Declaration(_Weakrefable): + + cdef: + CDeclaration decl + + cdef void init(self, const CDeclaration& c_decl) + + @staticmethod + cdef wrap(const CDeclaration& c_decl) + + cdef inline CDeclaration unwrap(self) nogil diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.pyx new file mode 100644 index 0000000000000000000000000000000000000000..9e8cbd65be224bb255448b580b44f0575942fc1e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_acero.pyx @@ -0,0 +1,608 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# --------------------------------------------------------------------- +# Low-level Acero bindings + +# cython: profile=False +# distutils: language = c++ +# cython: language_level = 3 + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_acero cimport * +from pyarrow.lib cimport (Table, pyarrow_unwrap_table, pyarrow_wrap_table, + RecordBatchReader) +from pyarrow.lib import frombytes, tobytes +from pyarrow._compute cimport ( + Expression, FunctionOptions, _ensure_field_ref, _true, + unwrap_null_placement, unwrap_sort_order +) + + +cdef class ExecNodeOptions(_Weakrefable): + """ + Base class for the node options. + + Use one of the subclasses to construct an options object. 
+ """ + __slots__ = () # avoid mistakingly creating attributes + + cdef void init(self, const shared_ptr[CExecNodeOptions]& sp): + self.wrapped = sp + + cdef inline shared_ptr[CExecNodeOptions] unwrap(self) nogil: + return self.wrapped + + +cdef class _TableSourceNodeOptions(ExecNodeOptions): + + def _set_options(self, Table table): + cdef: + shared_ptr[CTable] c_table + + c_table = pyarrow_unwrap_table(table) + self.wrapped.reset( + new CTableSourceNodeOptions(c_table) + ) + + +class TableSourceNodeOptions(_TableSourceNodeOptions): + """ + A Source node which accepts a table. + + This is the option class for the "table_source" node factory. + + Parameters + ---------- + table : pyarrow.Table + The table which acts as the data source. + """ + + def __init__(self, Table table): + self._set_options(table) + + +cdef class _FilterNodeOptions(ExecNodeOptions): + + def _set_options(self, Expression filter_expression not None): + self.wrapped.reset( + new CFilterNodeOptions(filter_expression.unwrap()) + ) + + +class FilterNodeOptions(_FilterNodeOptions): + """ + Make a node which excludes some rows from batches passed through it. + + This is the option class for the "filter" node factory. + + The "filter" operation provides an option to define data filtering + criteria. It selects rows where the given expression evaluates to true. + Filters can be written using pyarrow.compute.Expression, and the + expression must have a return type of boolean. + + Parameters + ---------- + filter_expression : pyarrow.compute.Expression + """ + + def __init__(self, Expression filter_expression): + self._set_options(filter_expression) + + +cdef class _ProjectNodeOptions(ExecNodeOptions): + + def _set_options(self, expressions, names=None): + cdef: + Expression expr + vector[CExpression] c_expressions + vector[c_string] c_names + + for expr in expressions: + c_expressions.push_back(expr.unwrap()) + + if names is not None: + if len(names) != len(expressions): + raise ValueError( + "The number of names should be equal to the number of expressions" + ) + + for name in names: + c_names.push_back(tobytes(name)) + + self.wrapped.reset( + new CProjectNodeOptions(c_expressions, c_names) + ) + else: + self.wrapped.reset( + new CProjectNodeOptions(c_expressions) + ) + + +class ProjectNodeOptions(_ProjectNodeOptions): + """ + Make a node which executes expressions on input batches, + producing batches of the same length with new columns. + + This is the option class for the "project" node factory. + + The "project" operation rearranges, deletes, transforms, and + creates columns. Each output column is computed by evaluating + an expression against the source record batch. These must be + scalar expressions (expressions consisting of scalar literals, + field references and scalar functions, i.e. elementwise functions + that return one value for each input row independent of the value + of all other rows). + + Parameters + ---------- + expressions : list of pyarrow.compute.Expression + List of expressions to evaluate against the source batch. This must + be scalar expressions. + names : list of str, optional + List of names for each of the output columns (same length as + `expressions`). If `names` is not provided, the string + representations of exprs will be used. 
+ """ + + def __init__(self, expressions, names=None): + self._set_options(expressions, names) + + +cdef class _AggregateNodeOptions(ExecNodeOptions): + + def _set_options(self, aggregates, keys=None): + cdef: + CAggregate c_aggr + vector[CAggregate] c_aggregations + vector[CFieldRef] c_keys + + for arg_names, func_name, opts, name in aggregates: + c_aggr.function = tobytes(func_name) + if opts is not None: + c_aggr.options = (opts).wrapped + else: + c_aggr.options = nullptr + if not isinstance(arg_names, (list, tuple)): + arg_names = [arg_names] + for arg in arg_names: + c_aggr.target.push_back(_ensure_field_ref(arg)) + c_aggr.name = tobytes(name) + + c_aggregations.push_back(move(c_aggr)) + + if keys is None: + keys = [] + for name in keys: + c_keys.push_back(_ensure_field_ref(name)) + + self.wrapped.reset( + new CAggregateNodeOptions(c_aggregations, c_keys) + ) + + +class AggregateNodeOptions(_AggregateNodeOptions): + """ + Make a node which aggregates input batches, optionally grouped by keys. + + This is the option class for the "aggregate" node factory. + + Acero supports two types of aggregates: "scalar" aggregates, + and "hash" aggregates. Scalar aggregates reduce an array or scalar + input to a single scalar output (e.g. computing the mean of a column). + Hash aggregates act like GROUP BY in SQL and first partition data + based on one or more key columns, then reduce the data in each partition. + The aggregate node supports both types of computation, and can compute + any number of aggregations at once. + + Parameters + ---------- + aggregates : list of tuples + Aggregations which will be applied to the targeted fields. + Specified as a list of tuples, where each tuple is one aggregation + specification and consists of: aggregation target column(s) followed + by function name, aggregation function options object and the + output field name. + The target column(s) specification can be a single field reference, + an empty list or a list of fields unary, nullary and n-ary aggregation + functions respectively. Each field reference can be a string + column name or expression. + keys : list of field references, optional + Keys by which aggregations will be grouped. Each key can reference + a field using a string name or expression. + """ + + def __init__(self, aggregates, keys=None): + self._set_options(aggregates, keys) + + +cdef class _OrderByNodeOptions(ExecNodeOptions): + + def _set_options(self, sort_keys, null_placement): + cdef: + vector[CSortKey] c_sort_keys + + for name, order in sort_keys: + c_sort_keys.push_back( + CSortKey(_ensure_field_ref(name), unwrap_sort_order(order)) + ) + + self.wrapped.reset( + new COrderByNodeOptions( + COrdering(c_sort_keys, unwrap_null_placement(null_placement)) + ) + ) + + +class OrderByNodeOptions(_OrderByNodeOptions): + """ + Make a node which applies a new ordering to the data. + + Currently this node works by accumulating all data, sorting, and then + emitting the new data with an updated batch index. + Larger-than-memory sort is not currently supported. + + This is the option class for the "order_by" node factory. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + Each field reference can be a string column name or expression. 
+ null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + """ + + def __init__(self, sort_keys=(), *, null_placement="at_end"): + self._set_options(sort_keys, null_placement) + + +cdef class _HashJoinNodeOptions(ExecNodeOptions): + + def _set_options( + self, join_type, left_keys, right_keys, left_output=None, right_output=None, + output_suffix_for_left="", output_suffix_for_right="", + ): + cdef: + CJoinType c_join_type + vector[CFieldRef] c_left_keys + vector[CFieldRef] c_right_keys + vector[CFieldRef] c_left_output + vector[CFieldRef] c_right_output + + # join type + if join_type == "left semi": + c_join_type = CJoinType_LEFT_SEMI + elif join_type == "right semi": + c_join_type = CJoinType_RIGHT_SEMI + elif join_type == "left anti": + c_join_type = CJoinType_LEFT_ANTI + elif join_type == "right anti": + c_join_type = CJoinType_RIGHT_ANTI + elif join_type == "inner": + c_join_type = CJoinType_INNER + elif join_type == "left outer": + c_join_type = CJoinType_LEFT_OUTER + elif join_type == "right outer": + c_join_type = CJoinType_RIGHT_OUTER + elif join_type == "full outer": + c_join_type = CJoinType_FULL_OUTER + else: + raise ValueError("Unsupported join type") + + # left/right keys + if not isinstance(left_keys, (list, tuple)): + left_keys = [left_keys] + for key in left_keys: + c_left_keys.push_back(_ensure_field_ref(key)) + if not isinstance(right_keys, (list, tuple)): + right_keys = [right_keys] + for key in right_keys: + c_right_keys.push_back(_ensure_field_ref(key)) + + # left/right output fields + if left_output is not None and right_output is not None: + for colname in left_output: + c_left_output.push_back(_ensure_field_ref(colname)) + for colname in right_output: + c_right_output.push_back(_ensure_field_ref(colname)) + + self.wrapped.reset( + new CHashJoinNodeOptions( + c_join_type, c_left_keys, c_right_keys, + c_left_output, c_right_output, + _true, + tobytes(output_suffix_for_left), + tobytes(output_suffix_for_right) + ) + ) + else: + self.wrapped.reset( + new CHashJoinNodeOptions( + c_join_type, c_left_keys, c_right_keys, + _true, + tobytes(output_suffix_for_left), + tobytes(output_suffix_for_right) + ) + ) + + +class HashJoinNodeOptions(_HashJoinNodeOptions): + """ + Make a node which implements join operation using hash join strategy. + + This is the option class for the "hashjoin" node factory. + + Parameters + ---------- + join_type : str + Type of join. One of "left semi", "right semi", "left anti", + "right anti", "inner", "left outer", "right outer", "full outer". + left_keys : str, Expression or list + Key fields from left input. Each key can be a string column name + or a field expression, or a list of such field references. + right_keys : str, Expression or list + Key fields from right input. See `left_keys` for details. + left_output : list, optional + List of output fields passed from left input. If left and right + output fields are not specified, all valid fields from both left and + right input will be output. Each field can be a string column name + or a field expression. + right_output : list, optional + List of output fields passed from right input. If left and right + output fields are not specified, all valid fields from both left and + right input will be output. Each field can be a string column name + or a field expression. 
+ output_suffix_for_left : str + Suffix added to names of output fields coming from left input + (used to distinguish, if necessary, between fields of the same + name in left and right input and can be left empty if there are + no name collisions). + output_suffix_for_right : str + Suffix added to names of output fields coming from right input, + see `output_suffix_for_left` for details. + """ + + def __init__( + self, join_type, left_keys, right_keys, left_output=None, right_output=None, + output_suffix_for_left="", output_suffix_for_right="" + ): + self._set_options( + join_type, left_keys, right_keys, left_output, right_output, + output_suffix_for_left, output_suffix_for_right + ) + + +cdef class _AsofJoinNodeOptions(ExecNodeOptions): + + def _set_options(self, left_on, left_by, right_on, right_by, tolerance): + cdef: + vector[CFieldRef] c_left_by + vector[CFieldRef] c_right_by + CAsofJoinKeys c_left_keys + CAsofJoinKeys c_right_keys + vector[CAsofJoinKeys] c_input_keys + + # Prepare left AsofJoinNodeOption::Keys + if not isinstance(left_by, (list, tuple)): + left_by = [left_by] + for key in left_by: + c_left_by.push_back(_ensure_field_ref(key)) + + c_left_keys.on_key = _ensure_field_ref(left_on) + c_left_keys.by_key = c_left_by + + c_input_keys.push_back(c_left_keys) + + # Prepare right AsofJoinNodeOption::Keys + if not isinstance(right_by, (list, tuple)): + right_by = [right_by] + for key in right_by: + c_right_by.push_back(_ensure_field_ref(key)) + + c_right_keys.on_key = _ensure_field_ref(right_on) + c_right_keys.by_key = c_right_by + + c_input_keys.push_back(c_right_keys) + + self.wrapped.reset( + new CAsofJoinNodeOptions( + c_input_keys, + tolerance, + ) + ) + + +class AsofJoinNodeOptions(_AsofJoinNodeOptions): + """ + Make a node which implements 'as of join' operation. + + This is the option class for the "asofjoin" node factory. + + Parameters + ---------- + left_on : str, Expression + The left key on which the join operation should be performed. + Can be a string column name or a field expression. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input dataset must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + left_by: str, Expression or list + The left keys on which the join operation should be performed. + Exact equality is used for each field of the "by" keys. + Each key can be a string column name or a field expression, + or a list of such field references. + right_on : str, Expression + The right key on which the join operation should be performed. + See `left_on` for details. + right_by: str, Expression or list + The right keys on which the join operation should be performed. + See `left_by` for details. + tolerance : int + The tolerance to use for the asof join. The tolerance is interpreted in + the same units as the "on" key. + """ + + def __init__(self, left_on, left_by, right_on, right_by, tolerance): + self._set_options(left_on, left_by, right_on, right_by, tolerance) + + +cdef class Declaration(_Weakrefable): + """ + Helper class for declaring the nodes of an ExecPlan. + + A Declaration represents an unconstructed ExecNode, and potentially + more since its inputs may also be Declarations or when constructed + with ``from_sequence``. 
+ + The possible ExecNodes to use are registered with a name, + the "factory name", and need to be specified using this name, together + with its corresponding ExecNodeOptions subclass. + + Parameters + ---------- + factory_name : str + The ExecNode factory name, such as "table_source", "filter", + "project" etc. See the ExecNodeOptions subclasses for the exact + factory names to use. + options : ExecNodeOptions + Corresponding ExecNodeOptions subclass (matching the factory name). + inputs : list of Declaration, optional + Input nodes for this declaration. Optional if the node is a source + node, or when the declaration gets combined later with + ``from_sequence``. + + Returns + ------- + Declaration + """ + cdef void init(self, const CDeclaration& c_decl): + self.decl = c_decl + + @staticmethod + cdef wrap(const CDeclaration& c_decl): + cdef Declaration self = Declaration.__new__(Declaration) + self.init(c_decl) + return self + + cdef inline CDeclaration unwrap(self) nogil: + return self.decl + + def __init__(self, factory_name, ExecNodeOptions options, inputs=None): + cdef: + c_string c_factory_name + CDeclaration c_decl + vector[CDeclaration.Input] c_inputs + + c_factory_name = tobytes(factory_name) + + if inputs is not None: + for ipt in inputs: + c_inputs.push_back( + CDeclaration.Input((ipt).unwrap()) + ) + + c_decl = CDeclaration(c_factory_name, c_inputs, options.unwrap()) + self.init(c_decl) + + @staticmethod + def from_sequence(decls): + """ + Convenience factory for the common case of a simple sequence of nodes. + + Each of the declarations will be appended to the inputs of the + subsequent declaration, and the final modified declaration will + be returned. + + Parameters + ---------- + decls : list of Declaration + + Returns + ------- + Declaration + """ + cdef: + vector[CDeclaration] c_decls + CDeclaration c_decl + + for decl in decls: + c_decls.push_back(( decl).unwrap()) + + c_decl = CDeclaration.Sequence(c_decls) + return Declaration.wrap(c_decl) + + def __str__(self): + return frombytes(GetResultValue(DeclarationToString(self.decl))) + + def __repr__(self): + return "\n{0}".format(str(self)) + + def to_table(self, bint use_threads=True): + """ + Run the declaration and collect the results into a table. + + This method will implicitly add a sink node to the declaration + to collect results into a table. It will then create an ExecPlan + from the declaration, start the exec plan, block until the plan + has finished, and return the created table. + + Parameters + ---------- + use_threads : bool, default True + If set to False, then all CPU work will be done on the calling + thread. I/O tasks will still happen on the I/O executor + and may be multi-threaded (but should not use significant CPU + resources). + + Returns + ------- + pyarrow.Table + """ + cdef: + shared_ptr[CTable] c_table + + with nogil: + c_table = GetResultValue(DeclarationToTable(self.unwrap(), use_threads)) + return pyarrow_wrap_table(c_table) + + def to_reader(self, bint use_threads=True): + """Run the declaration and return results as a RecordBatchReader. + + For details about the parameters, see `to_table`. 
+ + Returns + ------- + pyarrow.RecordBatchReader + """ + cdef: + RecordBatchReader reader + reader = RecordBatchReader.__new__(RecordBatchReader) + reader.reader.reset( + GetResultValue(DeclarationToReader(self.unwrap(), use_threads)).release() + ) + return reader diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_azurefs.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_azurefs.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..e7168783ebd9d4479ed19fef51f98fd78fc0daa5 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_azurefs.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_azurefs.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_azurefs.pyx new file mode 100644 index 0000000000000000000000000000000000000000..5cd6300c18c6a83e7036d84724666ba85396b530 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_azurefs.pyx @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from cython cimport binding + + +from pyarrow.lib import frombytes, tobytes +from pyarrow.includes.libarrow_fs cimport * +from pyarrow._fs cimport FileSystem + + +cdef class AzureFileSystem(FileSystem): + """ + Azure Blob Storage backed FileSystem implementation + + This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. + Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific + features will be used when they provide a performance advantage. Azurite emulator is + also supported. Note: `/` is the only supported delimiter. + + The storage account is considered the root of the filesystem. When enabled, containers + will be created or deleted during relevant directory operations. Obviously, this also + requires authentication with the additional permissions. + + By default `DefaultAzureCredential `__ + is used for authentication. This means it will try several types of authentication + and go with the first one that works. If any authentication parameters are provided when + initialising the FileSystem, they will be used instead of the default credential. + + Parameters + ---------- + account_name : str + Azure Blob Storage account name. This is the globally unique identifier for the + storage account. + account_key : str, default None + Account key of the storage account. Pass None to use default credential. + blob_storage_authority : str, default None + hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful + for connecting to a local emulator, like Azurite. 
+ dfs_storage_authority : str, default None + hostname[:port] of the Data Lake Gen 2 Service. Defaults to + `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite. + blob_storage_scheme : str, default None + Either `http` or `https`. Defaults to `https`. Useful for connecting to a local + emulator, like Azurite. + dfs_storage_scheme : str, default None + Either `http` or `https`. Defaults to `https`. Useful for connecting to a local + emulator, like Azurite. + + Examples + -------- + >>> from pyarrow import fs + >>> azure_fs = fs.AzureFileSystem(account_name='myaccount') + >>> azurite_fs = fs.AzureFileSystem( + ... account_name='devstoreaccount1', + ... account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + ... blob_storage_authority='127.0.0.1:10000', + ... dfs_storage_authority='127.0.0.1:10000', + ... blob_storage_scheme='http', + ... dfs_storage_scheme='http', + ... ) + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + cdef: + CAzureFileSystem* azurefs + c_string account_key + + def __init__(self, account_name, *, account_key=None, blob_storage_authority=None, + dfs_storage_authority=None, blob_storage_scheme=None, + dfs_storage_scheme=None): + cdef: + CAzureOptions options + shared_ptr[CAzureFileSystem] wrapped + + options.account_name = tobytes(account_name) + if blob_storage_authority: + options.blob_storage_authority = tobytes(blob_storage_authority) + if dfs_storage_authority: + options.dfs_storage_authority = tobytes(dfs_storage_authority) + if blob_storage_scheme: + options.blob_storage_scheme = tobytes(blob_storage_scheme) + if dfs_storage_scheme: + options.dfs_storage_scheme = tobytes(dfs_storage_scheme) + + if account_key: + options.ConfigureAccountKeyCredential(tobytes(account_key)) + self.account_key = tobytes(account_key) + else: + options.ConfigureDefaultCredential() + + with nogil: + wrapped = GetResultValue(CAzureFileSystem.Make(options)) + + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.azurefs = wrapped.get() + + @staticmethod + @binding(True) # Required for cython < 3 + def _reconstruct(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. 
+ return AzureFileSystem(**kwargs) + + def __reduce__(self): + cdef CAzureOptions opts = self.azurefs.options() + return ( + AzureFileSystem._reconstruct, (dict( + account_name=frombytes(opts.account_name), + account_key=frombytes(self.account_key), + blob_storage_authority=frombytes(opts.blob_storage_authority), + dfs_storage_authority=frombytes(opts.dfs_storage_authority), + blob_storage_scheme=frombytes(opts.blob_storage_scheme), + dfs_storage_scheme=frombytes(opts.dfs_storage_scheme) + ),)) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..95899396ce4ea4e599ff7a7b3ce2030cfc228675 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.cpython-312-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbbe645ea6b09731d207cc08441fb7c2a7b4376d909d01e59abf736fcaf6fc0d +size 1367552 diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.pxd new file mode 100644 index 0000000000000000000000000000000000000000..29b37da3ac4ef36106b10a09d7583bdba8d1a260 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.pxd @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: language_level = 3 + +from pyarrow.lib cimport * +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * + +cdef class UdfContext(_Weakrefable): + cdef: + CUdfContext c_context + + cdef void init(self, const CUdfContext& c_context) + + +cdef class FunctionOptions(_Weakrefable): + cdef: + shared_ptr[CFunctionOptions] wrapped + + cdef const CFunctionOptions* get_options(self) except NULL + cdef void init(self, const shared_ptr[CFunctionOptions]& sp) + + cdef inline shared_ptr[CFunctionOptions] unwrap(self) + + +cdef class _SortOptions(FunctionOptions): + pass + + +cdef CExpression _bind(Expression filter, Schema schema) except * + + +cdef class Expression(_Weakrefable): + + cdef: + CExpression expr + + cdef void init(self, const CExpression& sp) + + @staticmethod + cdef wrap(const CExpression& sp) + + cdef inline CExpression unwrap(self) + + @staticmethod + cdef Expression _expr_or_scalar(object expr) + + +cdef CExpression _true + +cdef CFieldRef _ensure_field_ref(value) except * + +cdef CSortOrder unwrap_sort_order(order) except * + +cdef CNullPlacement unwrap_null_placement(null_placement) except * diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.pyx new file mode 100644 index 0000000000000000000000000000000000000000..d39120934d5fd582c14b9ea478585118659bff6b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute.pyx @@ -0,0 +1,3274 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +import sys + +from cpython.object cimport Py_LT, Py_EQ, Py_GT, Py_LE, Py_NE, Py_GE +from cython.operator cimport dereference as deref + +from collections import namedtuple + +from pyarrow.lib import frombytes, tobytes, ArrowInvalid +from pyarrow.lib cimport * +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +import pyarrow.lib as lib +from pyarrow.util import _DEPR_MSG +from libcpp cimport bool as c_bool + +import inspect +try: + import numpy as np +except ImportError: + np = None +import warnings + + +__pas = None +_substrait_msg = ( + "The pyarrow installation is not built with support for Substrait." 
+) + + +SUPPORTED_INPUT_ARR_TYPES = (list, tuple) +if np is not None: + SUPPORTED_INPUT_ARR_TYPES += (np.ndarray, ) + + +def _pas(): + global __pas + if __pas is None: + try: + import pyarrow.substrait as pas + __pas = pas + except ImportError: + raise ImportError(_substrait_msg) + return __pas + + +def _forbid_instantiation(klass, subclasses_instead=True): + msg = '{} is an abstract class thus cannot be initialized.'.format( + klass.__name__ + ) + if subclasses_instead: + subclasses = [cls.__name__ for cls in klass.__subclasses__] + msg += ' Use one of the subclasses instead: {}'.format( + ', '.join(subclasses) + ) + raise TypeError(msg) + + +cdef wrap_scalar_function(const shared_ptr[CFunction]& sp_func): + """ + Wrap a C++ scalar Function in a ScalarFunction object. + """ + cdef ScalarFunction func = ScalarFunction.__new__(ScalarFunction) + func.init(sp_func) + return func + + +cdef wrap_vector_function(const shared_ptr[CFunction]& sp_func): + """ + Wrap a C++ vector Function in a VectorFunction object. + """ + cdef VectorFunction func = VectorFunction.__new__(VectorFunction) + func.init(sp_func) + return func + + +cdef wrap_scalar_aggregate_function(const shared_ptr[CFunction]& sp_func): + """ + Wrap a C++ aggregate Function in a ScalarAggregateFunction object. + """ + cdef ScalarAggregateFunction func = \ + ScalarAggregateFunction.__new__(ScalarAggregateFunction) + func.init(sp_func) + return func + + +cdef wrap_hash_aggregate_function(const shared_ptr[CFunction]& sp_func): + """ + Wrap a C++ aggregate Function in a HashAggregateFunction object. + """ + cdef HashAggregateFunction func = \ + HashAggregateFunction.__new__(HashAggregateFunction) + func.init(sp_func) + return func + + +cdef wrap_meta_function(const shared_ptr[CFunction]& sp_func): + """ + Wrap a C++ meta Function in a MetaFunction object. + """ + cdef MetaFunction func = MetaFunction.__new__(MetaFunction) + func.init(sp_func) + return func + + +cdef wrap_function(const shared_ptr[CFunction]& sp_func): + """ + Wrap a C++ Function in a Function object. + + This dispatches to specialized wrappers depending on the function kind. 
+ """ + if sp_func.get() == NULL: + raise ValueError("Function was NULL") + + cdef FunctionKind c_kind = sp_func.get().kind() + if c_kind == FunctionKind_SCALAR: + return wrap_scalar_function(sp_func) + elif c_kind == FunctionKind_VECTOR: + return wrap_vector_function(sp_func) + elif c_kind == FunctionKind_SCALAR_AGGREGATE: + return wrap_scalar_aggregate_function(sp_func) + elif c_kind == FunctionKind_HASH_AGGREGATE: + return wrap_hash_aggregate_function(sp_func) + elif c_kind == FunctionKind_META: + return wrap_meta_function(sp_func) + else: + raise NotImplementedError("Unknown Function::Kind") + + +cdef wrap_scalar_kernel(const CScalarKernel* c_kernel): + if c_kernel == NULL: + raise ValueError("Kernel was NULL") + cdef ScalarKernel kernel = ScalarKernel.__new__(ScalarKernel) + kernel.init(c_kernel) + return kernel + + +cdef wrap_vector_kernel(const CVectorKernel* c_kernel): + if c_kernel == NULL: + raise ValueError("Kernel was NULL") + cdef VectorKernel kernel = VectorKernel.__new__(VectorKernel) + kernel.init(c_kernel) + return kernel + + +cdef wrap_scalar_aggregate_kernel(const CScalarAggregateKernel* c_kernel): + if c_kernel == NULL: + raise ValueError("Kernel was NULL") + cdef ScalarAggregateKernel kernel = \ + ScalarAggregateKernel.__new__(ScalarAggregateKernel) + kernel.init(c_kernel) + return kernel + + +cdef wrap_hash_aggregate_kernel(const CHashAggregateKernel* c_kernel): + if c_kernel == NULL: + raise ValueError("Kernel was NULL") + cdef HashAggregateKernel kernel = \ + HashAggregateKernel.__new__(HashAggregateKernel) + kernel.init(c_kernel) + return kernel + + +cdef class Kernel(_Weakrefable): + """ + A kernel object. + + Kernels handle the execution of a Function for a certain signature. + """ + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly" + .format(self.__class__.__name__)) + + +cdef class ScalarKernel(Kernel): + cdef const CScalarKernel* kernel + + cdef void init(self, const CScalarKernel* kernel) except *: + self.kernel = kernel + + def __repr__(self): + return ("ScalarKernel<{}>" + .format(frombytes(self.kernel.signature.get().ToString()))) + + +cdef class VectorKernel(Kernel): + cdef const CVectorKernel* kernel + + cdef void init(self, const CVectorKernel* kernel) except *: + self.kernel = kernel + + def __repr__(self): + return ("VectorKernel<{}>" + .format(frombytes(self.kernel.signature.get().ToString()))) + + +cdef class ScalarAggregateKernel(Kernel): + cdef const CScalarAggregateKernel* kernel + + cdef void init(self, const CScalarAggregateKernel* kernel) except *: + self.kernel = kernel + + def __repr__(self): + return ("ScalarAggregateKernel<{}>" + .format(frombytes(self.kernel.signature.get().ToString()))) + + +cdef class HashAggregateKernel(Kernel): + cdef const CHashAggregateKernel* kernel + + cdef void init(self, const CHashAggregateKernel* kernel) except *: + self.kernel = kernel + + def __repr__(self): + return ("HashAggregateKernel<{}>" + .format(frombytes(self.kernel.signature.get().ToString()))) + + +FunctionDoc = namedtuple( + "FunctionDoc", + ("summary", "description", "arg_names", "options_class", + "options_required")) + + +cdef class Function(_Weakrefable): + """ + A compute function. + + A function implements a certain logical computation over a range of + possible input signatures. Each signature accepts a range of input + types and is implemented by a given Kernel. + + Functions can be of different kinds: + + * "scalar" functions apply an item-wise computation over all items + of their inputs. 
Each item in the output only depends on the values + of the inputs at the same position. Examples: addition, comparisons, + string predicates... + + * "vector" functions apply a collection-wise computation, such that + each item in the output may depend on the values of several items + in each input. Examples: dictionary encoding, sorting, extracting + unique values... + + * "scalar_aggregate" functions reduce the dimensionality of the inputs by + applying a reduction function. Examples: sum, min_max, mode... + + * "hash_aggregate" functions apply a reduction function to an input + subdivided by grouping criteria. They may not be directly called. + Examples: hash_sum, hash_min_max... + + * "meta" functions dispatch to other functions. + """ + + cdef: + shared_ptr[CFunction] sp_func + CFunction* base_func + + _kind_map = { + FunctionKind_SCALAR: "scalar", + FunctionKind_VECTOR: "vector", + FunctionKind_SCALAR_AGGREGATE: "scalar_aggregate", + FunctionKind_HASH_AGGREGATE: "hash_aggregate", + FunctionKind_META: "meta", + } + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly" + .format(self.__class__.__name__)) + + cdef void init(self, const shared_ptr[CFunction]& sp_func) except *: + self.sp_func = sp_func + self.base_func = sp_func.get() + + def __repr__(self): + return ("arrow.compute.Function" + .format(self.name, self.kind, self.arity, self.num_kernels)) + + def __reduce__(self): + # Reduction uses the global registry + return get_function, (self.name,) + + @property + def name(self): + """ + The function name. + """ + return frombytes(self.base_func.name()) + + @property + def arity(self): + """ + The function arity. + + If Ellipsis (i.e. `...`) is returned, the function takes a variable + number of arguments. + """ + cdef CArity arity = self.base_func.arity() + if arity.is_varargs: + return ... + else: + return arity.num_args + + @property + def kind(self): + """ + The function kind. + """ + cdef FunctionKind c_kind = self.base_func.kind() + try: + return self._kind_map[c_kind] + except KeyError: + raise NotImplementedError("Unknown Function::Kind") + + @property + def _doc(self): + """ + The C++-like function documentation (for internal use). + """ + cdef CFunctionDoc c_doc = self.base_func.doc() + return FunctionDoc(frombytes(c_doc.summary), + frombytes(c_doc.description), + [frombytes(s) for s in c_doc.arg_names], + frombytes(c_doc.options_class), + c_doc.options_required) + + @property + def num_kernels(self): + """ + The number of kernels implementing this function. + """ + return self.base_func.num_kernels() + + def call(self, args, FunctionOptions options=None, + MemoryPool memory_pool=None, length=None): + """ + Call the function on the given arguments. + + Parameters + ---------- + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. + options : FunctionOptions, optional + Options instance for executing this function. This should have + the right concrete options type. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If + not passed, will be inferred from passed data. 
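+
+ Examples
+ --------
+ A small illustrative sketch (the function name and argument values
+ are arbitrary; plain Python scalars are converted to Arrow scalars
+ before execution):
+
+ >>> import pyarrow.compute as pc
+ >>> add = pc.get_function("add")
+ >>> result = add.call([1, 2])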
+ """ + cdef: + const CFunctionOptions* c_options = NULL + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + CExecContext c_exec_ctx = CExecContext(pool) + CExecBatch c_batch + CDatum result + + _pack_compute_args(args, &c_batch.values) + + if options is not None: + c_options = options.get_options() + + if length is not None: + c_batch.length = length + with nogil: + result = GetResultValue( + self.base_func.Execute(c_batch, c_options, &c_exec_ctx) + ) + else: + with nogil: + result = GetResultValue( + self.base_func.Execute(c_batch.values, c_options, + &c_exec_ctx) + ) + + return wrap_datum(result) + + +cdef class ScalarFunction(Function): + cdef const CScalarFunction* func + + cdef void init(self, const shared_ptr[CFunction]& sp_func) except *: + Function.init(self, sp_func) + self.func = sp_func.get() + + @property + def kernels(self): + """ + The kernels implementing this function. + """ + cdef vector[const CScalarKernel*] kernels = self.func.kernels() + return [wrap_scalar_kernel(k) for k in kernels] + + +cdef class VectorFunction(Function): + cdef const CVectorFunction* func + + cdef void init(self, const shared_ptr[CFunction]& sp_func) except *: + Function.init(self, sp_func) + self.func = sp_func.get() + + @property + def kernels(self): + """ + The kernels implementing this function. + """ + cdef vector[const CVectorKernel*] kernels = self.func.kernels() + return [wrap_vector_kernel(k) for k in kernels] + + +cdef class ScalarAggregateFunction(Function): + cdef const CScalarAggregateFunction* func + + cdef void init(self, const shared_ptr[CFunction]& sp_func) except *: + Function.init(self, sp_func) + self.func = sp_func.get() + + @property + def kernels(self): + """ + The kernels implementing this function. + """ + cdef vector[const CScalarAggregateKernel*] kernels = \ + self.func.kernels() + return [wrap_scalar_aggregate_kernel(k) for k in kernels] + + +cdef class HashAggregateFunction(Function): + cdef const CHashAggregateFunction* func + + cdef void init(self, const shared_ptr[CFunction]& sp_func) except *: + Function.init(self, sp_func) + self.func = sp_func.get() + + @property + def kernels(self): + """ + The kernels implementing this function. + """ + cdef vector[const CHashAggregateKernel*] kernels = self.func.kernels() + return [wrap_hash_aggregate_kernel(k) for k in kernels] + + +cdef class MetaFunction(Function): + cdef const CMetaFunction* func + + cdef void init(self, const shared_ptr[CFunction]& sp_func) except *: + Function.init(self, sp_func) + self.func = sp_func.get() + + # Since num_kernels is exposed, also expose a kernels property + @property + def kernels(self): + """ + The kernels implementing this function. + """ + return [] + + +cdef _pack_compute_args(object values, vector[CDatum]* out): + for val in values: + if isinstance(val, SUPPORTED_INPUT_ARR_TYPES): + val = lib.asarray(val) + + if isinstance(val, Array): + out.push_back(CDatum(( val).sp_array)) + continue + elif isinstance(val, ChunkedArray): + out.push_back(CDatum(( val).sp_chunked_array)) + continue + elif isinstance(val, Scalar): + out.push_back(CDatum(( val).unwrap())) + continue + elif isinstance(val, RecordBatch): + out.push_back(CDatum(( val).sp_batch)) + continue + elif isinstance(val, Table): + out.push_back(CDatum(( val).sp_table)) + continue + else: + # Is it a Python scalar? 
+ try: + scal = lib.scalar(val) + except Exception: + # Raise dedicated error below + pass + else: + out.push_back(CDatum(( scal).unwrap())) + continue + + raise TypeError(f"Got unexpected argument type {type(val)} " + "for compute function") + + +cdef class FunctionRegistry(_Weakrefable): + cdef CFunctionRegistry* registry + + def __init__(self): + self.registry = GetFunctionRegistry() + + def list_functions(self): + """ + Return all function names in the registry. + """ + cdef vector[c_string] names = self.registry.GetFunctionNames() + return [frombytes(name) for name in names] + + def get_function(self, name): + """ + Look up a function by name in the registry. + + Parameters + ---------- + name : str + The name of the function to lookup + """ + cdef: + c_string c_name = tobytes(name) + shared_ptr[CFunction] func + with nogil: + func = GetResultValue(self.registry.GetFunction(c_name)) + return wrap_function(func) + + +cdef FunctionRegistry _global_func_registry = FunctionRegistry() + + +def function_registry(): + return _global_func_registry + + +def get_function(name): + """ + Get a function by name. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to lookup + """ + return _global_func_registry.get_function(name) + + +def list_functions(): + """ + Return all function names in the global registry. + """ + return _global_func_registry.list_functions() + + +def call_function(name, args, options=None, memory_pool=None, length=None): + """ + Call a named function. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to call. + args : list + The arguments to the function. + options : optional + options provided to the function. + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If not + passed, inferred from data. + """ + func = _global_func_registry.get_function(name) + return func.call(args, options=options, memory_pool=memory_pool, + length=length) + + +cdef class FunctionOptions(_Weakrefable): + __slots__ = () # avoid mistakingly creating attributes + + cdef const CFunctionOptions* get_options(self) except NULL: + return self.wrapped.get() + + cdef void init(self, const shared_ptr[CFunctionOptions]& sp): + self.wrapped = sp + + cdef inline shared_ptr[CFunctionOptions] unwrap(self): + return self.wrapped + + def serialize(self): + cdef: + CResult[shared_ptr[CBuffer]] res = self.get_options().Serialize() + shared_ptr[CBuffer] c_buf = GetResultValue(res) + return pyarrow_wrap_buffer(c_buf) + + @staticmethod + def deserialize(buf): + """ + Deserialize options for a function. + + Parameters + ---------- + buf : Buffer + The buffer containing the data to deserialize. 
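+
+ Examples
+ --------
+ A round-trip sketch (the concrete options class is arbitrary, and it
+ assumes this class is exposed as ``pyarrow.compute.FunctionOptions``):
+
+ >>> import pyarrow.compute as pc
+ >>> opts = pc.ScalarAggregateOptions(min_count=2)
+ >>> buf = opts.serialize()
+ >>> restored = pc.FunctionOptions.deserialize(buf)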
+ """ + cdef: + shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(buf) + CResult[unique_ptr[CFunctionOptions]] maybe_options = \ + DeserializeFunctionOptions(deref(c_buf)) + shared_ptr[CFunctionOptions] c_options + c_options = to_shared(GetResultValue(move(maybe_options))) + type_name = frombytes(c_options.get().options_type().type_name()) + module = globals() + if type_name not in module: + raise ValueError(f'Cannot deserialize "{type_name}"') + klass = module[type_name] + options = klass.__new__(klass) + ( options).init(c_options) + return options + + def __repr__(self): + type_name = self.__class__.__name__ + # Remove {} so we can use our own braces + string_repr = frombytes(self.get_options().ToString())[1:-1] + return f"{type_name}({string_repr})" + + def __eq__(self, FunctionOptions other): + return self.get_options().Equals(deref(other.get_options())) + + +def _raise_invalid_function_option(value, description, *, + exception_class=ValueError): + raise exception_class(f"\"{value}\" is not a valid {description}") + + +# NOTE: +# To properly expose the constructor signature of FunctionOptions +# subclasses, we use a two-level inheritance: +# 1. a C extension class that implements option validation and setting +# (won't expose function signatures because of +# https://github.com/cython/cython/issues/3873) +# 2. a Python derived class that implements the constructor + +cdef class _CastOptions(FunctionOptions): + cdef CCastOptions* options + + cdef void init(self, const shared_ptr[CFunctionOptions]& sp): + FunctionOptions.init(self, sp) + self.options = self.wrapped.get() + + def _set_options(self, DataType target_type, allow_int_overflow, + allow_time_truncate, allow_time_overflow, + allow_decimal_truncate, allow_float_truncate, + allow_invalid_utf8): + cdef: + shared_ptr[CCastOptions] wrapped = make_shared[CCastOptions]() + self.init( wrapped) + self._set_type(target_type) + if allow_int_overflow is not None: + self.allow_int_overflow = allow_int_overflow + if allow_time_truncate is not None: + self.allow_time_truncate = allow_time_truncate + if allow_time_overflow is not None: + self.allow_time_overflow = allow_time_overflow + if allow_decimal_truncate is not None: + self.allow_decimal_truncate = allow_decimal_truncate + if allow_float_truncate is not None: + self.allow_float_truncate = allow_float_truncate + if allow_invalid_utf8 is not None: + self.allow_invalid_utf8 = allow_invalid_utf8 + + def _set_type(self, target_type=None): + if target_type is not None: + deref(self.options).to_type = \ + ( ensure_type(target_type)).sp_type + + def _set_safe(self): + self.init(shared_ptr[CFunctionOptions]( + new CCastOptions(CCastOptions.Safe()))) + + def _set_unsafe(self): + self.init(shared_ptr[CFunctionOptions]( + new CCastOptions(CCastOptions.Unsafe()))) + + def is_safe(self): + return not (deref(self.options).allow_int_overflow or + deref(self.options).allow_time_truncate or + deref(self.options).allow_time_overflow or + deref(self.options).allow_decimal_truncate or + deref(self.options).allow_float_truncate or + deref(self.options).allow_invalid_utf8) + + @property + def allow_int_overflow(self): + return deref(self.options).allow_int_overflow + + @allow_int_overflow.setter + def allow_int_overflow(self, c_bool flag): + deref(self.options).allow_int_overflow = flag + + @property + def allow_time_truncate(self): + return deref(self.options).allow_time_truncate + + @allow_time_truncate.setter + def allow_time_truncate(self, c_bool flag): + deref(self.options).allow_time_truncate = flag + + 
@property + def allow_time_overflow(self): + return deref(self.options).allow_time_overflow + + @allow_time_overflow.setter + def allow_time_overflow(self, c_bool flag): + deref(self.options).allow_time_overflow = flag + + @property + def allow_decimal_truncate(self): + return deref(self.options).allow_decimal_truncate + + @allow_decimal_truncate.setter + def allow_decimal_truncate(self, c_bool flag): + deref(self.options).allow_decimal_truncate = flag + + @property + def allow_float_truncate(self): + return deref(self.options).allow_float_truncate + + @allow_float_truncate.setter + def allow_float_truncate(self, c_bool flag): + deref(self.options).allow_float_truncate = flag + + @property + def allow_invalid_utf8(self): + return deref(self.options).allow_invalid_utf8 + + @allow_invalid_utf8.setter + def allow_invalid_utf8(self, c_bool flag): + deref(self.options).allow_invalid_utf8 = flag + + +class CastOptions(_CastOptions): + """ + Options for the `cast` function. + + Parameters + ---------- + target_type : DataType, optional + The PyArrow type to cast to. + allow_int_overflow : bool, default False + Whether integer overflow is allowed when casting. + allow_time_truncate : bool, default False + Whether time precision truncation is allowed when casting. + allow_time_overflow : bool, default False + Whether date/time range overflow is allowed when casting. + allow_decimal_truncate : bool, default False + Whether decimal precision truncation is allowed when casting. + allow_float_truncate : bool, default False + Whether floating-point precision truncation is allowed when casting. + allow_invalid_utf8 : bool, default False + Whether producing invalid utf8 data is allowed when casting. + """ + + def __init__(self, target_type=None, *, allow_int_overflow=None, + allow_time_truncate=None, allow_time_overflow=None, + allow_decimal_truncate=None, allow_float_truncate=None, + allow_invalid_utf8=None): + self._set_options(target_type, allow_int_overflow, allow_time_truncate, + allow_time_overflow, allow_decimal_truncate, + allow_float_truncate, allow_invalid_utf8) + + @staticmethod + def safe(target_type=None): + """" + Create a CastOptions for a safe cast. + + Parameters + ---------- + target_type : optional + Target cast type for the safe cast. + """ + self = CastOptions() + self._set_safe() + self._set_type(target_type) + return self + + @staticmethod + def unsafe(target_type=None): + """" + Create a CastOptions for an unsafe cast. + + Parameters + ---------- + target_type : optional + Target cast type for the unsafe cast. + """ + self = CastOptions() + self._set_unsafe() + self._set_type(target_type) + return self + + +def _skip_nulls_doc(): + # (note the weird indent because of how the string is inserted + # by callers) + return """skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +""" + + +def _min_count_doc(*, default): + return f"""min_count : int, default {default} + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +""" + + +cdef class _ElementWiseAggregateOptions(FunctionOptions): + def _set_options(self, skip_nulls): + self.wrapped.reset(new CElementWiseAggregateOptions(skip_nulls)) + + +class ElementWiseAggregateOptions(_ElementWiseAggregateOptions): + __doc__ = f""" + Options for element-wise aggregate functions. 
+ + Parameters + ---------- + {_skip_nulls_doc()} + """ + + def __init__(self, *, skip_nulls=True): + self._set_options(skip_nulls) + + +cdef CRoundMode unwrap_round_mode(round_mode) except *: + if round_mode == "down": + return CRoundMode_DOWN + elif round_mode == "up": + return CRoundMode_UP + elif round_mode == "towards_zero": + return CRoundMode_TOWARDS_ZERO + elif round_mode == "towards_infinity": + return CRoundMode_TOWARDS_INFINITY + elif round_mode == "half_down": + return CRoundMode_HALF_DOWN + elif round_mode == "half_up": + return CRoundMode_HALF_UP + elif round_mode == "half_towards_zero": + return CRoundMode_HALF_TOWARDS_ZERO + elif round_mode == "half_towards_infinity": + return CRoundMode_HALF_TOWARDS_INFINITY + elif round_mode == "half_to_even": + return CRoundMode_HALF_TO_EVEN + elif round_mode == "half_to_odd": + return CRoundMode_HALF_TO_ODD + _raise_invalid_function_option(round_mode, "round mode") + + +cdef class _RoundOptions(FunctionOptions): + def _set_options(self, ndigits, round_mode): + self.wrapped.reset( + new CRoundOptions(ndigits, unwrap_round_mode(round_mode)) + ) + + +class RoundOptions(_RoundOptions): + """ + Options for rounding numbers. + + Parameters + ---------- + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + + def __init__(self, ndigits=0, round_mode="half_to_even"): + self._set_options(ndigits, round_mode) + + +cdef class _RoundBinaryOptions(FunctionOptions): + def _set_options(self, round_mode): + self.wrapped.reset( + new CRoundBinaryOptions(unwrap_round_mode(round_mode)) + ) + + +class RoundBinaryOptions(_RoundBinaryOptions): + """ + Options for rounding numbers when ndigits is provided by a second array + + Parameters + ---------- + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + + def __init__(self, round_mode="half_to_even"): + self._set_options(round_mode) + + +cdef CCalendarUnit unwrap_round_temporal_unit(unit) except *: + if unit == "nanosecond": + return CCalendarUnit_NANOSECOND + elif unit == "microsecond": + return CCalendarUnit_MICROSECOND + elif unit == "millisecond": + return CCalendarUnit_MILLISECOND + elif unit == "second": + return CCalendarUnit_SECOND + elif unit == "minute": + return CCalendarUnit_MINUTE + elif unit == "hour": + return CCalendarUnit_HOUR + elif unit == "day": + return CCalendarUnit_DAY + elif unit == "week": + return CCalendarUnit_WEEK + elif unit == "month": + return CCalendarUnit_MONTH + elif unit == "quarter": + return CCalendarUnit_QUARTER + elif unit == "year": + return CCalendarUnit_YEAR + _raise_invalid_function_option(unit, "Calendar unit") + + +cdef class _RoundTemporalOptions(FunctionOptions): + def _set_options(self, multiple, unit, week_starts_monday, + ceil_is_strictly_greater, calendar_based_origin): + self.wrapped.reset( + new CRoundTemporalOptions( + multiple, unwrap_round_temporal_unit(unit), + week_starts_monday, ceil_is_strictly_greater, + calendar_based_origin) + ) + + +class RoundTemporalOptions(_RoundTemporalOptions): + """ + Options for rounding temporal values. 
+ + Parameters + ---------- + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + + """ + + def __init__(self, multiple=1, unit="day", *, week_starts_monday=True, + ceil_is_strictly_greater=False, + calendar_based_origin=False): + self._set_options(multiple, unit, week_starts_monday, + ceil_is_strictly_greater, + calendar_based_origin) + + +cdef class _RoundToMultipleOptions(FunctionOptions): + def _set_options(self, multiple, round_mode): + if not isinstance(multiple, Scalar): + try: + multiple = lib.scalar(multiple) + except Exception: + _raise_invalid_function_option( + multiple, "multiple type for RoundToMultipleOptions", + exception_class=TypeError) + + self.wrapped.reset( + new CRoundToMultipleOptions( + pyarrow_unwrap_scalar(multiple), unwrap_round_mode(round_mode)) + ) + + +class RoundToMultipleOptions(_RoundToMultipleOptions): + """ + Options for rounding numbers to a multiple. + + Parameters + ---------- + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". 
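+
+ Examples
+ --------
+ A small illustrative sketch (input values are arbitrary):
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> arr = pa.array([1.23, 4.56])
+ >>> opts = pc.RoundToMultipleOptions(multiple=0.05)
+ >>> result = pc.round_to_multiple(arr, options=opts)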
+ """ + + def __init__(self, multiple=1.0, round_mode="half_to_even"): + self._set_options(multiple, round_mode) + + +cdef class _JoinOptions(FunctionOptions): + _null_handling_map = { + "emit_null": CJoinNullHandlingBehavior_EMIT_NULL, + "skip": CJoinNullHandlingBehavior_SKIP, + "replace": CJoinNullHandlingBehavior_REPLACE, + } + + def _set_options(self, null_handling, null_replacement): + try: + self.wrapped.reset( + new CJoinOptions(self._null_handling_map[null_handling], + tobytes(null_replacement)) + ) + except KeyError: + _raise_invalid_function_option(null_handling, "null handling") + + +class JoinOptions(_JoinOptions): + """ + Options for the `binary_join_element_wise` function. + + Parameters + ---------- + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + """ + + def __init__(self, null_handling="emit_null", null_replacement=""): + self._set_options(null_handling, null_replacement) + + +cdef class _MatchSubstringOptions(FunctionOptions): + def _set_options(self, pattern, ignore_case): + self.wrapped.reset( + new CMatchSubstringOptions(tobytes(pattern), ignore_case) + ) + + +class MatchSubstringOptions(_MatchSubstringOptions): + """ + Options for looking for a substring. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + """ + + def __init__(self, pattern, *, ignore_case=False): + self._set_options(pattern, ignore_case) + + +cdef class _PadOptions(FunctionOptions): + def _set_options(self, width, padding, lean_left_on_odd_padding): + self.wrapped.reset(new CPadOptions(width, tobytes(padding), lean_left_on_odd_padding)) + + +class PadOptions(_PadOptions): + """ + Options for padding strings. + + Parameters + ---------- + width : int + Desired string length. + padding : str, default " " + What to pad the string with. Should be one byte or codepoint. + lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). + """ + + def __init__(self, width, padding=' ', lean_left_on_odd_padding=True): + self._set_options(width, padding, lean_left_on_odd_padding) + + +cdef class _TrimOptions(FunctionOptions): + def _set_options(self, characters): + self.wrapped.reset(new CTrimOptions(tobytes(characters))) + + +class TrimOptions(_TrimOptions): + """ + Options for trimming characters from strings. + + Parameters + ---------- + characters : str + Individual characters to be trimmed from the string. + """ + + def __init__(self, characters): + self._set_options(tobytes(characters)) + + +cdef class _ReplaceSubstringOptions(FunctionOptions): + def _set_options(self, pattern, replacement, max_replacements): + self.wrapped.reset( + new CReplaceSubstringOptions(tobytes(pattern), + tobytes(replacement), + max_replacements) + ) + + +class ReplaceSubstringOptions(_ReplaceSubstringOptions): + """ + Options for replacing matched substrings. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. 
+ max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + """ + + def __init__(self, pattern, replacement, *, max_replacements=None): + if max_replacements is None: + max_replacements = -1 + self._set_options(pattern, replacement, max_replacements) + + +cdef class _ExtractRegexOptions(FunctionOptions): + def _set_options(self, pattern): + self.wrapped.reset(new CExtractRegexOptions(tobytes(pattern))) + + +class ExtractRegexOptions(_ExtractRegexOptions): + """ + Options for the `extract_regex` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + + def __init__(self, pattern): + self._set_options(pattern) + + +cdef class _SliceOptions(FunctionOptions): + def _set_options(self, start, stop, step): + self.wrapped.reset(new CSliceOptions(start, stop, step)) + + +class SliceOptions(_SliceOptions): + """ + Options for slicing. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + """ + + def __init__(self, start, stop=None, step=1): + if stop is None: + stop = sys.maxsize + if step < 0: + stop = -stop + self._set_options(start, stop, step) + + +cdef class _ListSliceOptions(FunctionOptions): + cpdef _set_options(self, start, stop=None, step=1, return_fixed_size_list=None): + cdef: + CListSliceOptions* opts + opts = new CListSliceOptions( + start, + nullopt if stop is None + else (stop), + step, + nullopt if return_fixed_size_list is None + else (return_fixed_size_list) + ) + self.wrapped.reset(opts) + + +class ListSliceOptions(_ListSliceOptions): + """ + Options for list array slicing. + + Parameters + ---------- + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + """ + + def __init__(self, start, stop=None, step=1, return_fixed_size_list=None): + self._set_options(start, stop, step, return_fixed_size_list) + + +cdef class _ReplaceSliceOptions(FunctionOptions): + def _set_options(self, start, stop, replacement): + self.wrapped.reset( + new CReplaceSliceOptions(start, stop, tobytes(replacement)) + ) + + +class ReplaceSliceOptions(_ReplaceSliceOptions): + """ + Options for replacing slices. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. 
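+
+ Examples
+ --------
+ A small illustrative sketch using the "utf8_replace_slice" function
+ (input values are arbitrary):
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> arr = pa.array(["hello", "world"])
+ >>> result = pc.utf8_replace_slice(arr, start=0, stop=2, replacement="__")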
+ """ + + def __init__(self, start, stop, replacement): + self._set_options(start, stop, replacement) + + +cdef class _FilterOptions(FunctionOptions): + _null_selection_map = { + "drop": CFilterNullSelectionBehavior_DROP, + "emit_null": CFilterNullSelectionBehavior_EMIT_NULL, + } + + def _set_options(self, null_selection_behavior): + try: + self.wrapped.reset( + new CFilterOptions( + self._null_selection_map[null_selection_behavior] + ) + ) + except KeyError: + _raise_invalid_function_option(null_selection_behavior, + "null selection behavior") + + +class FilterOptions(_FilterOptions): + """ + Options for selecting with a boolean filter. + + Parameters + ---------- + null_selection_behavior : str, default "drop" + How to handle nulls in the selection filter. + Accepted values are "drop", "emit_null". + """ + + def __init__(self, null_selection_behavior="drop"): + self._set_options(null_selection_behavior) + + +cdef class _DictionaryEncodeOptions(FunctionOptions): + _null_encoding_map = { + "encode": CDictionaryEncodeNullEncodingBehavior_ENCODE, + "mask": CDictionaryEncodeNullEncodingBehavior_MASK, + } + + def _set_options(self, null_encoding): + try: + self.wrapped.reset( + new CDictionaryEncodeOptions( + self._null_encoding_map[null_encoding] + ) + ) + except KeyError: + _raise_invalid_function_option(null_encoding, "null encoding") + + +class DictionaryEncodeOptions(_DictionaryEncodeOptions): + """ + Options for dictionary encoding. + + Parameters + ---------- + null_encoding : str, default "mask" + How to encode nulls in the input. + Accepted values are "mask" (null inputs emit a null in the indices + array), "encode" (null inputs emit a non-null index pointing to + a null value in the dictionary array). + """ + + def __init__(self, null_encoding="mask"): + self._set_options(null_encoding) + + +cdef class _RunEndEncodeOptions(FunctionOptions): + def _set_options(self, run_end_type): + run_end_ty = ensure_type(run_end_type) + self.wrapped.reset(new CRunEndEncodeOptions(pyarrow_unwrap_data_type(run_end_ty))) + + +class RunEndEncodeOptions(_RunEndEncodeOptions): + """ + Options for run-end encoding. + + Parameters + ---------- + run_end_type : DataType, default pyarrow.int32() + The data type of the run_ends array. + + Accepted values are pyarrow.{int16(), int32(), int64()}. + """ + + def __init__(self, run_end_type=lib.int32()): + self._set_options(run_end_type) + + +cdef class _TakeOptions(FunctionOptions): + def _set_options(self, boundscheck): + self.wrapped.reset(new CTakeOptions(boundscheck)) + + +class TakeOptions(_TakeOptions): + """ + Options for the `take` and `array_take` functions. + + Parameters + ---------- + boundscheck : boolean, default True + Whether to check indices are within bounds. If False and an + index is out of bounds, behavior is undefined (the process + may crash). + """ + + def __init__(self, *, boundscheck=True): + self._set_options(boundscheck) + + +cdef class _MakeStructOptions(FunctionOptions): + def _set_options(self, field_names, field_nullability, field_metadata): + cdef: + vector[c_string] c_field_names + vector[shared_ptr[const CKeyValueMetadata]] c_field_metadata + for name in field_names: + c_field_names.push_back(tobytes(name)) + for metadata in field_metadata: + c_field_metadata.push_back(pyarrow_unwrap_metadata(metadata)) + self.wrapped.reset( + new CMakeStructOptions(c_field_names, field_nullability, + c_field_metadata) + ) + + +class MakeStructOptions(_MakeStructOptions): + """ + Options for the `make_struct` function. 
+ + Parameters + ---------- + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + """ + + def __init__(self, field_names=(), *, field_nullability=None, + field_metadata=None): + if field_nullability is None: + field_nullability = [True] * len(field_names) + if field_metadata is None: + field_metadata = [None] * len(field_names) + self._set_options(field_names, field_nullability, field_metadata) + + +cdef CFieldRef _ensure_field_ref(value) except *: + cdef: + CFieldRef field_ref + const CFieldRef* field_ref_ptr + + if isinstance(value, (list, tuple)): + value = Expression._nested_field(tuple(value)) + + if isinstance(value, Expression): + field_ref_ptr = (value).unwrap().field_ref() + if field_ref_ptr is NULL: + raise ValueError("Unable to get FieldRef from Expression") + field_ref = deref(field_ref_ptr) + elif isinstance(value, (bytes, str)): + if value.startswith(b'.' if isinstance(value, bytes) else '.'): + field_ref = GetResultValue( + CFieldRef.FromDotPath(tobytes(value))) + else: + field_ref = CFieldRef(tobytes(value)) + elif isinstance(value, int): + field_ref = CFieldRef( value) + else: + raise TypeError("Expected a field reference as a str or int, list of " + f"str or int, or Expression. Got {type(value)} instead.") + return field_ref + + +cdef class _StructFieldOptions(FunctionOptions): + def _set_options(self, indices): + + if isinstance(indices, (list, tuple)) and not len(indices): + # Allow empty indices; effectively return same array + self.wrapped.reset( + new CStructFieldOptions(indices)) + return + + cdef CFieldRef field_ref = _ensure_field_ref(indices) + self.wrapped.reset(new CStructFieldOptions(field_ref)) + + +class StructFieldOptions(_StructFieldOptions): + """ + Options for the `struct_field` function. + + Parameters + ---------- + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + """ + + def __init__(self, indices): + self._set_options(indices) + + +cdef class _ScalarAggregateOptions(FunctionOptions): + def _set_options(self, skip_nulls, min_count): + self.wrapped.reset(new CScalarAggregateOptions(skip_nulls, min_count)) + + +class ScalarAggregateOptions(_ScalarAggregateOptions): + __doc__ = f""" + Options for scalar aggregations. + + Parameters + ---------- + {_skip_nulls_doc()} + {_min_count_doc(default=1)} + """ + + def __init__(self, *, skip_nulls=True, min_count=1): + self._set_options(skip_nulls, min_count) + + +cdef class _CountOptions(FunctionOptions): + _mode_map = { + "only_valid": CCountMode_ONLY_VALID, + "only_null": CCountMode_ONLY_NULL, + "all": CCountMode_ALL, + } + + def _set_options(self, mode): + try: + self.wrapped.reset(new CCountOptions(self._mode_map[mode])) + except KeyError: + _raise_invalid_function_option(mode, "count mode") + + +class CountOptions(_CountOptions): + """ + Options for the `count` function. + + Parameters + ---------- + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". 
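+
+ Examples
+ --------
+ A minimal sketch of the three counting modes, assuming ``pc.count``
+ forwards the ``mode`` keyword to these options:
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> arr = pa.array([1, None, 3])
+ >>> pc.count(arr).as_py()            # "only_valid" is the default
+ 2
+ >>> pc.count(arr, mode="only_null").as_py()
+ 1
+ >>> pc.count(arr, mode="all").as_py()
+ 3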
+ """ + + def __init__(self, mode="only_valid"): + self._set_options(mode) + + +cdef class _IndexOptions(FunctionOptions): + def _set_options(self, scalar): + self.wrapped.reset(new CIndexOptions(pyarrow_unwrap_scalar(scalar))) + + +class IndexOptions(_IndexOptions): + """ + Options for the `index` function. + + Parameters + ---------- + value : Scalar + The value to search for. + """ + + def __init__(self, value): + self._set_options(value) + + +cdef class _MapLookupOptions(FunctionOptions): + _occurrence_map = { + "all": CMapLookupOccurrence_ALL, + "first": CMapLookupOccurrence_FIRST, + "last": CMapLookupOccurrence_LAST, + } + + def _set_options(self, query_key, occurrence): + try: + self.wrapped.reset( + new CMapLookupOptions( + pyarrow_unwrap_scalar(query_key), + self._occurrence_map[occurrence] + ) + ) + except KeyError: + _raise_invalid_function_option(occurrence, + "Should either be first, last, or all") + + +class MapLookupOptions(_MapLookupOptions): + """ + Options for the `map_lookup` function. + + Parameters + ---------- + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + """ + + def __init__(self, query_key, occurrence): + if not isinstance(query_key, lib.Scalar): + query_key = lib.scalar(query_key) + + self._set_options(query_key, occurrence) + + +cdef class _ModeOptions(FunctionOptions): + def _set_options(self, n, skip_nulls, min_count): + self.wrapped.reset(new CModeOptions(n, skip_nulls, min_count)) + + +class ModeOptions(_ModeOptions): + __doc__ = f""" + Options for the `mode` function. + + Parameters + ---------- + n : int, default 1 + Number of distinct most-common values to return. + {_skip_nulls_doc()} + {_min_count_doc(default=0)} + """ + + def __init__(self, n=1, *, skip_nulls=True, min_count=0): + self._set_options(n, skip_nulls, min_count) + + +cdef class _SetLookupOptions(FunctionOptions): + def _set_options(self, value_set, c_bool skip_nulls): + cdef unique_ptr[CDatum] valset + if isinstance(value_set, Array): + valset.reset(new CDatum(( value_set).sp_array)) + elif isinstance(value_set, ChunkedArray): + valset.reset( + new CDatum(( value_set).sp_chunked_array) + ) + elif isinstance(value_set, Scalar): + valset.reset(new CDatum(( value_set).unwrap())) + else: + _raise_invalid_function_option(value_set, "value set", + exception_class=TypeError) + + self.wrapped.reset(new CSetLookupOptions(deref(valset), skip_nulls)) + + +class SetLookupOptions(_SetLookupOptions): + """ + Options for the `is_in` and `index_in` functions. + + Parameters + ---------- + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + """ + + def __init__(self, value_set, *, skip_nulls=False): + self._set_options(value_set, skip_nulls) + + +cdef class _StrptimeOptions(FunctionOptions): + _unit_map = { + "s": TimeUnit_SECOND, + "ms": TimeUnit_MILLI, + "us": TimeUnit_MICRO, + "ns": TimeUnit_NANO, + } + + def _set_options(self, format, unit, error_is_null): + try: + self.wrapped.reset( + new CStrptimeOptions(tobytes(format), self._unit_map[unit], + error_is_null) + ) + except KeyError: + _raise_invalid_function_option(unit, "time unit") + + +class StrptimeOptions(_StrptimeOptions): + """ + Options for the `strptime` function. 
+ + Parameters + ---------- + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + """ + + def __init__(self, format, unit, error_is_null=False): + self._set_options(format, unit, error_is_null) + + +cdef class _StrftimeOptions(FunctionOptions): + def _set_options(self, format, locale): + self.wrapped.reset( + new CStrftimeOptions(tobytes(format), tobytes(locale)) + ) + + +class StrftimeOptions(_StrftimeOptions): + """ + Options for the `strftime` function. + + Parameters + ---------- + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + """ + + def __init__(self, format="%Y-%m-%dT%H:%M:%S", locale="C"): + self._set_options(format, locale) + + +cdef class _DayOfWeekOptions(FunctionOptions): + def _set_options(self, count_from_zero, week_start): + self.wrapped.reset( + new CDayOfWeekOptions(count_from_zero, week_start) + ) + + +class DayOfWeekOptions(_DayOfWeekOptions): + """ + Options for the `day_of_week` function. + + Parameters + ---------- + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + """ + + def __init__(self, *, count_from_zero=True, week_start=1): + self._set_options(count_from_zero, week_start) + + +cdef class _WeekOptions(FunctionOptions): + def _set_options(self, week_starts_monday, count_from_zero, + first_week_is_fully_in_year): + self.wrapped.reset( + new CWeekOptions(week_starts_monday, count_from_zero, + first_week_is_fully_in_year) + ) + + +class WeekOptions(_WeekOptions): + """ + Options for the `week` function. + + Parameters + ---------- + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). + first_week_is_fully_in_year : bool, default False + If True, week number 0 is fully in January. + If False, a week that begins on December 29, 30 or 31 is considered + to be week number 0 of the following year. 
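+
+ Examples
+ --------
+ A minimal sketch using ``pc.week`` with the default options, which
+ are assumed here to correspond to ISO week numbering:
+
+ >>> from datetime import datetime
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> ts = pa.array([datetime(2023, 1, 1), datetime(2023, 1, 2)])
+ >>> pc.week(ts).to_pylist()  # Sunday 2023-01-01 falls in ISO week 52 of 2022
+ [52, 1]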
+ """ + + def __init__(self, *, week_starts_monday=True, count_from_zero=False, + first_week_is_fully_in_year=False): + self._set_options(week_starts_monday, + count_from_zero, first_week_is_fully_in_year) + + +cdef class _AssumeTimezoneOptions(FunctionOptions): + _ambiguous_map = { + "raise": CAssumeTimezoneAmbiguous_AMBIGUOUS_RAISE, + "earliest": CAssumeTimezoneAmbiguous_AMBIGUOUS_EARLIEST, + "latest": CAssumeTimezoneAmbiguous_AMBIGUOUS_LATEST, + } + _nonexistent_map = { + "raise": CAssumeTimezoneNonexistent_NONEXISTENT_RAISE, + "earliest": CAssumeTimezoneNonexistent_NONEXISTENT_EARLIEST, + "latest": CAssumeTimezoneNonexistent_NONEXISTENT_LATEST, + } + + def _set_options(self, timezone, ambiguous, nonexistent): + if ambiguous not in self._ambiguous_map: + _raise_invalid_function_option(ambiguous, + "'ambiguous' timestamp handling") + if nonexistent not in self._nonexistent_map: + _raise_invalid_function_option(nonexistent, + "'nonexistent' timestamp handling") + self.wrapped.reset( + new CAssumeTimezoneOptions(tobytes(timezone), + self._ambiguous_map[ambiguous], + self._nonexistent_map[nonexistent]) + ) + + +class AssumeTimezoneOptions(_AssumeTimezoneOptions): + """ + Options for the `assume_timezone` function. + + Parameters + ---------- + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + """ + + def __init__(self, timezone, *, ambiguous="raise", nonexistent="raise"): + self._set_options(timezone, ambiguous, nonexistent) + + +cdef class _NullOptions(FunctionOptions): + def _set_options(self, nan_is_null): + self.wrapped.reset(new CNullOptions(nan_is_null)) + + +class NullOptions(_NullOptions): + """ + Options for the `is_null` function. + + Parameters + ---------- + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + """ + + def __init__(self, *, nan_is_null=False): + self._set_options(nan_is_null) + + +cdef class _VarianceOptions(FunctionOptions): + def _set_options(self, ddof, skip_nulls, min_count): + self.wrapped.reset(new CVarianceOptions(ddof, skip_nulls, min_count)) + + +class VarianceOptions(_VarianceOptions): + __doc__ = f""" + Options for the `variance` and `stddev` functions. + + Parameters + ---------- + ddof : int, default 0 + Number of degrees of freedom. + {_skip_nulls_doc()} + {_min_count_doc(default=0)} + """ + + def __init__(self, *, ddof=0, skip_nulls=True, min_count=0): + self._set_options(ddof, skip_nulls, min_count) + + +cdef class _SplitOptions(FunctionOptions): + def _set_options(self, max_splits, reverse): + self.wrapped.reset(new CSplitOptions(max_splits, reverse)) + + +class SplitOptions(_SplitOptions): + """ + Options for splitting on whitespace. + + Parameters + ---------- + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. 
+ """ + + def __init__(self, *, max_splits=None, reverse=False): + if max_splits is None: + max_splits = -1 + self._set_options(max_splits, reverse) + + +cdef class _SplitPatternOptions(FunctionOptions): + def _set_options(self, pattern, max_splits, reverse): + self.wrapped.reset( + new CSplitPatternOptions(tobytes(pattern), max_splits, reverse) + ) + + +class SplitPatternOptions(_SplitPatternOptions): + """ + Options for splitting on a string pattern. + + Parameters + ---------- + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + + def __init__(self, pattern, *, max_splits=None, reverse=False): + if max_splits is None: + max_splits = -1 + self._set_options(pattern, max_splits, reverse) + + +cdef CSortOrder unwrap_sort_order(order) except *: + if order == "ascending": + return CSortOrder_Ascending + elif order == "descending": + return CSortOrder_Descending + _raise_invalid_function_option(order, "sort order") + + +cdef CNullPlacement unwrap_null_placement(null_placement) except *: + if null_placement == "at_start": + return CNullPlacement_AtStart + elif null_placement == "at_end": + return CNullPlacement_AtEnd + _raise_invalid_function_option(null_placement, "null placement") + + +cdef class _PartitionNthOptions(FunctionOptions): + def _set_options(self, pivot, null_placement): + self.wrapped.reset(new CPartitionNthOptions( + pivot, unwrap_null_placement(null_placement))) + + +class PartitionNthOptions(_PartitionNthOptions): + """ + Options for the `partition_nth_indices` function. + + Parameters + ---------- + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + """ + + def __init__(self, pivot, *, null_placement="at_end"): + self._set_options(pivot, null_placement) + + +cdef class _CumulativeOptions(FunctionOptions): + def _set_options(self, start, skip_nulls): + if start is None: + self.wrapped.reset(new CCumulativeOptions(skip_nulls)) + elif isinstance(start, Scalar): + self.wrapped.reset(new CCumulativeOptions( + pyarrow_unwrap_scalar(start), skip_nulls)) + else: + try: + start = lib.scalar(start) + self.wrapped.reset(new CCumulativeOptions( + pyarrow_unwrap_scalar(start), skip_nulls)) + except Exception: + _raise_invalid_function_option( + start, "`start` type for CumulativeOptions", TypeError) + + +class CumulativeOptions(_CumulativeOptions): + """ + Options for `cumulative_*` functions. + + - cumulative_sum + - cumulative_sum_checked + - cumulative_prod + - cumulative_prod_checked + - cumulative_max + - cumulative_min + + Parameters + ---------- + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + + def __init__(self, start=None, *, skip_nulls=False): + self._set_options(start, skip_nulls) + + +class CumulativeSumOptions(_CumulativeOptions): + """ + Options for `cumulative_sum` function. 
+ + Parameters + ---------- + start : Scalar, default None + Starting value for sum computation + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + + def __init__(self, start=None, *, skip_nulls=False): + warnings.warn( + _DEPR_MSG.format("CumulativeSumOptions", "14.0", "CumulativeOptions"), + FutureWarning, + stacklevel=2 + ) + self._set_options(start, skip_nulls) + + +cdef class _PairwiseOptions(FunctionOptions): + def _set_options(self, period): + self.wrapped.reset(new CPairwiseOptions(period)) + + +class PairwiseOptions(_PairwiseOptions): + """ + Options for `pairwise` functions. + + Parameters + ---------- + period : int, default 1 + Period for applying the period function. + """ + + def __init__(self, period=1): + self._set_options(period) + + +cdef class _ListFlattenOptions(FunctionOptions): + def _set_options(self, recursive): + self.wrapped.reset(new CListFlattenOptions(recursive)) + + +class ListFlattenOptions(_ListFlattenOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + + def __init__(self, recursive=False): + self._set_options(recursive) + + +cdef class _ArraySortOptions(FunctionOptions): + def _set_options(self, order, null_placement): + self.wrapped.reset(new CArraySortOptions( + unwrap_sort_order(order), unwrap_null_placement(null_placement))) + + +class ArraySortOptions(_ArraySortOptions): + """ + Options for the `array_sort_indices` function. + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + """ + + def __init__(self, order="ascending", *, null_placement="at_end"): + self._set_options(order, null_placement) + + +cdef class _SortOptions(FunctionOptions): + def _set_options(self, sort_keys, null_placement): + cdef vector[CSortKey] c_sort_keys + for name, order in sort_keys: + c_sort_keys.push_back( + CSortKey(_ensure_field_ref(name), unwrap_sort_order(order)) + ) + self.wrapped.reset(new CSortOptions( + c_sort_keys, unwrap_null_placement(null_placement))) + + +class SortOptions(_SortOptions): + """ + Options for the `sort_indices` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + """ + + def __init__(self, sort_keys=(), *, null_placement="at_end"): + self._set_options(sort_keys, null_placement) + + +cdef class _SelectKOptions(FunctionOptions): + def _set_options(self, k, sort_keys): + cdef vector[CSortKey] c_sort_keys + for name, order in sort_keys: + c_sort_keys.push_back( + CSortKey(_ensure_field_ref(name), unwrap_sort_order(order)) + ) + self.wrapped.reset(new CSelectKOptions(k, c_sort_keys)) + + +class SelectKOptions(_SelectKOptions): + """ + Options for top/bottom k-selection. + + Parameters + ---------- + k : int + Number of leading values to select in sorted order + (i.e. 
the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + """ + + def __init__(self, k, sort_keys): + self._set_options(k, sort_keys) + + +cdef class _QuantileOptions(FunctionOptions): + _interp_map = { + "linear": CQuantileInterp_LINEAR, + "lower": CQuantileInterp_LOWER, + "higher": CQuantileInterp_HIGHER, + "nearest": CQuantileInterp_NEAREST, + "midpoint": CQuantileInterp_MIDPOINT, + } + + def _set_options(self, quantiles, interp, skip_nulls, min_count): + try: + self.wrapped.reset( + new CQuantileOptions(quantiles, self._interp_map[interp], + skip_nulls, min_count) + ) + except KeyError: + _raise_invalid_function_option(interp, "quantile interpolation") + + +class QuantileOptions(_QuantileOptions): + __doc__ = f""" + Options for the `quantile` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + {_skip_nulls_doc()} + {_min_count_doc(default=0)} + """ + + def __init__(self, q=0.5, *, interpolation="linear", skip_nulls=True, + min_count=0): + if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES): + q = [q] + self._set_options(q, interpolation, skip_nulls, min_count) + + +cdef class _TDigestOptions(FunctionOptions): + def _set_options(self, quantiles, delta, buffer_size, skip_nulls, + min_count): + self.wrapped.reset( + new CTDigestOptions(quantiles, delta, buffer_size, skip_nulls, + min_count) + ) + + +class TDigestOptions(_TDigestOptions): + __doc__ = f""" + Options for the `tdigest` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + {_skip_nulls_doc()} + {_min_count_doc(default=0)} + """ + + def __init__(self, q=0.5, *, delta=100, buffer_size=500, skip_nulls=True, + min_count=0): + if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES): + q = [q] + self._set_options(q, delta, buffer_size, skip_nulls, min_count) + + +cdef class _Utf8NormalizeOptions(FunctionOptions): + _form_map = { + "NFC": CUtf8NormalizeForm_NFC, + "NFKC": CUtf8NormalizeForm_NFKC, + "NFD": CUtf8NormalizeForm_NFD, + "NFKD": CUtf8NormalizeForm_NFKD, + } + + def _set_options(self, form): + try: + self.wrapped.reset( + new CUtf8NormalizeOptions(self._form_map[form]) + ) + except KeyError: + _raise_invalid_function_option(form, + "Unicode normalization form") + + +class Utf8NormalizeOptions(_Utf8NormalizeOptions): + """ + Options for the `utf8_normalize` function. + + Parameters + ---------- + form : str + Unicode normalization form. + Accepted values are "NFC", "NFKC", "NFD", NFKD". 
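+
+ Examples
+ --------
+ A minimal sketch, assuming ``pc.utf8_normalize`` accepts ``form`` as a
+ keyword; the code-point length shows the effect of decomposition:
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> arr = pa.array(["\u00e9"])  # precomposed "e with acute"
+ >>> pc.utf8_length(pc.utf8_normalize(arr, form="NFC")).to_pylist()
+ [1]
+ >>> pc.utf8_length(pc.utf8_normalize(arr, form="NFD")).to_pylist()
+ [2]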
+ """ + + def __init__(self, form): + self._set_options(form) + + +cdef class _RandomOptions(FunctionOptions): + def _set_options(self, initializer): + if initializer == 'system': + self.wrapped.reset(new CRandomOptions( + CRandomOptions.FromSystemRandom())) + return + + if not isinstance(initializer, int): + try: + initializer = hash(initializer) + except TypeError: + raise TypeError( + f"initializer should be 'system', an integer, " + f"or a hashable object; got {initializer!r}") + + if initializer < 0: + initializer += 2**64 + self.wrapped.reset(new CRandomOptions( + CRandomOptions.FromSeed(initializer))) + + +class RandomOptions(_RandomOptions): + """ + Options for random generation. + + Parameters + ---------- + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + """ + + def __init__(self, *, initializer='system'): + self._set_options(initializer) + + +cdef class _RankOptions(FunctionOptions): + + _tiebreaker_map = { + "min": CRankOptionsTiebreaker_Min, + "max": CRankOptionsTiebreaker_Max, + "first": CRankOptionsTiebreaker_First, + "dense": CRankOptionsTiebreaker_Dense, + } + + def _set_options(self, sort_keys, null_placement, tiebreaker): + cdef vector[CSortKey] c_sort_keys + if isinstance(sort_keys, str): + c_sort_keys.push_back( + CSortKey(_ensure_field_ref(""), unwrap_sort_order(sort_keys)) + ) + else: + for name, order in sort_keys: + c_sort_keys.push_back( + CSortKey(_ensure_field_ref(name), unwrap_sort_order(order)) + ) + try: + self.wrapped.reset( + new CRankOptions(c_sort_keys, + unwrap_null_placement(null_placement), + self._tiebreaker_map[tiebreaker]) + ) + except KeyError: + _raise_invalid_function_option(tiebreaker, "tiebreaker") + + +class RankOptions(_RankOptions): + """ + Options for the `rank` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + """ + + def __init__(self, sort_keys="ascending", *, null_placement="at_end", tiebreaker="first"): + self._set_options(sort_keys, null_placement, tiebreaker) + + +cdef class Expression(_Weakrefable): + """ + A logical expression to be evaluated against some input. + + To create an expression: + + - Use the factory function ``pyarrow.compute.scalar()`` to create a + scalar (not necessary when combined, see example below). 
+ - Use the factory function ``pyarrow.compute.field()`` to reference + a field (column in table). + - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``. + - Combine expressions using python operators ``&`` (logical and), + ``|`` (logical or) and ``~`` (logical not). + Note: python keywords ``and``, ``or`` and ``not`` cannot be used + to combine expressions. + - Create expression predicates using Expression methods such as + ``pyarrow.compute.Expression.isin()``. + + Examples + -------- + + >>> import pyarrow.compute as pc + >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7) + 7))> + >>> pc.field('a') != 3 + + >>> pc.field('a').isin([1, 2, 3]) + + """ + + def __init__(self): + msg = 'Expression is an abstract class thus cannot be initialized.' + raise TypeError(msg) + + cdef void init(self, const CExpression& sp): + self.expr = sp + + @staticmethod + cdef wrap(const CExpression& sp): + cdef Expression self = Expression.__new__(Expression) + self.init(sp) + return self + + cdef inline CExpression unwrap(self): + return self.expr + + def equals(self, Expression other): + """ + Parameters + ---------- + other : pyarrow.dataset.Expression + + Returns + ------- + bool + """ + return self.expr.Equals(other.unwrap()) + + def __str__(self): + return frombytes(self.expr.ToString()) + + def __repr__(self): + return "".format( + self.__class__.__name__, str(self) + ) + + @staticmethod + def from_substrait(object buffer not None): + """ + Deserialize an expression from Substrait + + The serialized message must be an ExtendedExpression message that has + only a single expression. The name of the expression and the schema + the expression was bound to will be ignored. Use + pyarrow.substrait.deserialize_expressions if this information is needed + or if the message might contain multiple expressions. + + Parameters + ---------- + buffer : bytes or Buffer + The Substrait message to deserialize + + Returns + ------- + Expression + The deserialized expression + """ + expressions = _pas().deserialize_expressions(buffer).expressions + if len(expressions) == 0: + raise ValueError("Substrait message did not contain any expressions") + if len(expressions) > 1: + raise ValueError( + "Substrait message contained multiple expressions. Use pyarrow.substrait.deserialize_expressions instead") + return next(iter(expressions.values())) + + def to_substrait(self, Schema schema not None, c_bool allow_arrow_extensions=False): + """ + Serialize the expression using Substrait + + The expression will be serialized as an ExtendedExpression message that has a + single expression named "expression" + + Parameters + ---------- + schema : Schema + The input schema the expression will be bound to + allow_arrow_extensions : bool, default False + If False then only functions that are part of the core Substrait function + definitions will be allowed. Set this to True to allow pyarrow-specific functions + but the result may not be accepted by other compute libraries. + + Returns + ------- + Buffer + A buffer containing the serialized Protobuf plan. 
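+
+ Examples
+ --------
+ A round-trip sketch (this assumes your pyarrow build includes
+ Substrait support):
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> schema = pa.schema([("a", pa.int64())])
+ >>> buf = (pc.field("a") > 1).to_substrait(schema)
+ >>> expr = pc.Expression.from_substrait(buf)
+ >>> isinstance(expr, pc.Expression)
+ True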
+ """ + return _pas().serialize_expressions([self], ["expression"], schema, allow_arrow_extensions=allow_arrow_extensions) + + @staticmethod + def _deserialize(Buffer buffer not None): + return Expression.wrap(GetResultValue(CDeserializeExpression( + pyarrow_unwrap_buffer(buffer)))) + + def __reduce__(self): + buffer = pyarrow_wrap_buffer(GetResultValue( + CSerializeExpression(self.expr))) + return Expression._deserialize, (buffer,) + + @staticmethod + cdef Expression _expr_or_scalar(object expr): + if isinstance(expr, Expression): + return ( expr) + return ( Expression._scalar(expr)) + + @staticmethod + def _call(str function_name, list arguments, FunctionOptions options=None): + cdef: + vector[CExpression] c_arguments + shared_ptr[CFunctionOptions] c_options + + for argument in arguments: + if not isinstance(argument, Expression): + # Attempt to help convert this to an expression + try: + argument = Expression._scalar(argument) + except ArrowInvalid: + raise TypeError( + "only other expressions allowed as arguments") + c_arguments.push_back(( argument).expr) + + if options is not None: + c_options = options.unwrap() + + return Expression.wrap(CMakeCallExpression( + tobytes(function_name), move(c_arguments), c_options)) + + def __richcmp__(self, other, int op): + other = Expression._expr_or_scalar(other) + return Expression._call({ + Py_EQ: "equal", + Py_NE: "not_equal", + Py_GT: "greater", + Py_GE: "greater_equal", + Py_LT: "less", + Py_LE: "less_equal", + }[op], [self, other]) + + def __bool__(self): + raise ValueError( + "An Expression cannot be evaluated to python True or False. " + "If you are using the 'and', 'or' or 'not' operators, use '&', " + "'|' or '~' instead." + ) + + def __invert__(self): + return Expression._call("invert", [self]) + + def __and__(Expression self, other): + other = Expression._expr_or_scalar(other) + return Expression._call("and_kleene", [self, other]) + + def __or__(Expression self, other): + other = Expression._expr_or_scalar(other) + return Expression._call("or_kleene", [self, other]) + + def __add__(Expression self, other): + other = Expression._expr_or_scalar(other) + return Expression._call("add_checked", [self, other]) + + def __mul__(Expression self, other): + other = Expression._expr_or_scalar(other) + return Expression._call("multiply_checked", [self, other]) + + def __sub__(Expression self, other): + other = Expression._expr_or_scalar(other) + return Expression._call("subtract_checked", [self, other]) + + def __truediv__(Expression self, other): + other = Expression._expr_or_scalar(other) + return Expression._call("divide_checked", [self, other]) + + def is_valid(self): + """ + Check whether the expression is not-null (valid). + + This creates a new expression equivalent to calling the + `is_valid` compute function on this expression. + + Returns + ------- + is_valid : Expression + """ + return Expression._call("is_valid", [self]) + + def is_null(self, bint nan_is_null=False): + """ + Check whether the expression is null. + + This creates a new expression equivalent to calling the + `is_null` compute function on this expression. + + Parameters + ---------- + nan_is_null : boolean, default False + Whether floating-point NaNs are considered null. + + Returns + ------- + is_null : Expression + """ + options = NullOptions(nan_is_null=nan_is_null) + return Expression._call("is_null", [self], options) + + def is_nan(self): + """ + Check whether the expression is NaN. 
+ + This creates a new expression equivalent to calling the + `is_nan` compute function on this expression. + + Returns + ------- + is_nan : Expression + """ + return Expression._call("is_nan", [self]) + + def cast(self, type=None, safe=None, options=None): + """ + Explicitly set or change the expression's data type. + + This creates a new expression equivalent to calling the + `cast` compute function on this expression. + + Parameters + ---------- + type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Expression + """ + safe_vars_passed = (safe is not None) or (type is not None) + + if safe_vars_passed and (options is not None): + raise ValueError("Must either pass values for 'type' and 'safe' or pass a " + "value for 'options'") + + if options is None: + type = ensure_type(type, allow_none=False) + if safe is False: + options = CastOptions.unsafe(type) + else: + options = CastOptions.safe(type) + return Expression._call("cast", [self], options) + + def isin(self, values): + """ + Check whether the expression is contained in values. + + This creates a new expression equivalent to calling the + `is_in` compute function on this expression. + + Parameters + ---------- + values : Array or iterable + The values to check for. + + Returns + ------- + isin : Expression + A new expression that, when evaluated, checks whether + this expression's value is contained in `values`. + """ + if not isinstance(values, Array): + values = lib.array(values) + + options = SetLookupOptions(values) + return Expression._call("is_in", [self], options) + + @staticmethod + def _field(name_or_idx not None): + cdef: + CFieldRef c_field + + if isinstance(name_or_idx, int): + return Expression.wrap(CMakeFieldExpressionByIndex(name_or_idx)) + else: + c_field = CFieldRef( tobytes(name_or_idx)) + return Expression.wrap(CMakeFieldExpression(c_field)) + + @staticmethod + def _nested_field(tuple names not None): + cdef: + vector[CFieldRef] nested + + if len(names) == 0: + raise ValueError("nested field reference should be non-empty") + nested.reserve(len(names)) + for name in names: + if isinstance(name, int): + nested.push_back(CFieldRef(name)) + else: + nested.push_back(CFieldRef( tobytes(name))) + return Expression.wrap(CMakeFieldExpression(CFieldRef(move(nested)))) + + @staticmethod + def _scalar(value): + cdef: + Scalar scalar + + if isinstance(value, Scalar): + scalar = value + else: + scalar = lib.scalar(value) + + return Expression.wrap(CMakeScalarExpression(scalar.unwrap())) + + +_deserialize = Expression._deserialize +cdef CExpression _true = CMakeScalarExpression( + make_shared[CBooleanScalar](True) +) + + +cdef CExpression _bind(Expression filter, Schema schema) except *: + assert schema is not None + + if filter is None: + return _true + + return GetResultValue(filter.unwrap().Bind( + deref(pyarrow_unwrap_schema(schema).get()))) + + +cdef class UdfContext: + """ + Per-invocation function context/state. + + This object will always be the first argument to a user-defined + function. It should not be used outside of a call to the function. 
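+
+ Examples
+ --------
+ A minimal sketch of a scalar UDF body that uses the context it is
+ handed (see ``register_scalar_function`` below for full registration):
+
+ >>> import pyarrow.compute as pc
+ >>> def add_one(ctx, values):
+ ...     # ctx is the UdfContext; reuse its memory pool for nested kernel calls
+ ...     return pc.add(values, 1, memory_pool=ctx.memory_pool)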
+ """ + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly" + .format(self.__class__.__name__)) + + cdef void init(self, const CUdfContext &c_context): + self.c_context = c_context + + @property + def batch_length(self): + """ + The common length of all input arguments (int). + + In the case that all arguments are scalars, this value + is used to pass the "actual length" of the arguments, + e.g. because the scalar values are encoding a column + with a constant value. + """ + return self.c_context.batch_length + + @property + def memory_pool(self): + """ + A memory pool for allocations (:class:`MemoryPool`). + + This is the memory pool supplied by the user when they invoked + the function and it should be used in any calls to arrow that the + UDF makes if that call accepts a memory_pool. + """ + return box_memory_pool(self.c_context.pool) + + +cdef inline CFunctionDoc _make_function_doc(dict func_doc) except *: + """ + Helper function to generate the FunctionDoc + This function accepts a dictionary and expects the + summary(str), description(str) and arg_names(List[str]) keys. + """ + cdef: + CFunctionDoc f_doc + vector[c_string] c_arg_names + + f_doc.summary = tobytes(func_doc["summary"]) + f_doc.description = tobytes(func_doc["description"]) + for arg_name in func_doc["arg_names"]: + c_arg_names.push_back(tobytes(arg_name)) + f_doc.arg_names = c_arg_names + # UDFOptions integration: + # TODO: https://issues.apache.org/jira/browse/ARROW-16041 + f_doc.options_class = b"" + f_doc.options_required = False + return f_doc + + +cdef object box_udf_context(const CUdfContext& c_context): + cdef UdfContext context = UdfContext.__new__(UdfContext) + context.init(c_context) + return context + + +cdef _udf_callback(user_function, const CUdfContext& c_context, inputs): + """ + Helper callback function used to wrap the UdfContext from Python to C++ + execution. + """ + context = box_udf_context(c_context) + return user_function(context, *inputs) + + +def _get_udf_context(memory_pool, batch_length): + cdef CUdfContext c_context + c_context.pool = maybe_unbox_memory_pool(memory_pool) + c_context.batch_length = batch_length + context = box_udf_context(c_context) + return context + + +ctypedef CStatus (*CRegisterUdf)(PyObject* function, function[CallbackUdf] wrapper, + const CUdfOptions& options, CFunctionRegistry* registry) + +cdef class RegisterUdf(_Weakrefable): + cdef CRegisterUdf register_func + + cdef void init(self, const CRegisterUdf register_func): + self.register_func = register_func + + +cdef get_register_scalar_function(): + cdef RegisterUdf reg = RegisterUdf.__new__(RegisterUdf) + reg.register_func = RegisterScalarFunction + return reg + + +cdef get_register_tabular_function(): + cdef RegisterUdf reg = RegisterUdf.__new__(RegisterUdf) + reg.register_func = RegisterTabularFunction + return reg + + +cdef get_register_aggregate_function(): + cdef RegisterUdf reg = RegisterUdf.__new__(RegisterUdf) + reg.register_func = RegisterAggregateFunction + return reg + +cdef get_register_vector_function(): + cdef RegisterUdf reg = RegisterUdf.__new__(RegisterUdf) + reg.register_func = RegisterVectorFunction + return reg + + +def register_scalar_function(func, function_name, function_doc, in_types, out_type, + func_registry=None): + """ + Register a user-defined scalar function. + + This API is EXPERIMENTAL. + + A scalar function is a function that executes elementwise + operations on arrays or scalars, i.e. 
a scalar function must + be computed row-by-row with no state where each output row + is computed only from its corresponding input row. + In other words, all argument arrays have the same length, + and the output array is of the same length as the arguments. + Scalar functions are the only functions allowed in query engine + expressions. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple udf" + >>> func_doc["description"] = "add a constant to a scalar" + >>> + >>> def add_constant(ctx, array): + ... return pc.add(array, 1, memory_pool=ctx.memory_pool) + >>> + >>> func_name = "py_add_func" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.int64() + >>> pc.register_scalar_function(add_constant, func_name, func_doc, + ... in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_add_func' + >>> answer = pc.call_function(func_name, [pa.array([20])]) + >>> answer + + [ + 21 + ] + """ + return _register_user_defined_function(get_register_scalar_function(), + func, function_name, function_doc, in_types, + out_type, func_registry) + + +def register_vector_function(func, function_name, function_doc, in_types, out_type, + func_registry=None): + """ + Register a user-defined vector function. + + This API is EXPERIMENTAL. + + A vector function is a function that executes vector + operations on arrays. Vector function is often used + when compute doesn't fit other more specific types of + functions (e.g., scalar and aggregate). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). 
+ in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "percent rank" + >>> func_doc["description"] = "compute percent rank" + >>> + >>> def list_flatten_udf(ctx, x): + ... return pc.list_flatten(x) + >>> + >>> func_name = "list_flatten_udf" + >>> in_types = {"array": pa.list_(pa.int64())} + >>> out_type = pa.int64() + >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, + ... in_types, out_type) + >>> + >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])]) + >>> answer + + [ + 1, + 2, + 3, + 4 + ] + """ + return _register_user_defined_function(get_register_vector_function(), + func, function_name, function_doc, in_types, + out_type, func_registry) + + +def register_aggregate_function(func, function_name, function_doc, in_types, out_type, + func_registry=None): + """ + Register a user-defined non-decomposable aggregate function. + + This API is EXPERIMENTAL. + + A non-decomposable aggregation function is a function that executes + aggregate operations on the whole data that it is aggregating. + In other words, non-decomposable aggregate function cannot be + split into consume/merge/finalize steps. + + This is often used with ordered or segmented aggregation where groups + can be emit before accumulating all of the input data. + + Note that currently the size of any input column cannot exceed 2 GB + for a single segment (all groups combined). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return a Scalar matching the + out_type. + To define a varargs function, pass a callable that takes + *args. The in_type needs to match in type of inputs when + the function gets called. + function_name : str + Name of the function. This name must be unique, i.e., + there should only be one function registered with + this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import numpy as np + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple median udf" + >>> func_doc["description"] = "compute median" + >>> + >>> def compute_median(ctx, array): + ... return pa.scalar(np.median(array)) + >>> + >>> func_name = "py_compute_median" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.float64() + >>> pc.register_aggregate_function(compute_median, func_name, func_doc, + ... 
in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_compute_median' + >>> answer = pc.call_function(func_name, [pa.array([20, 40])]) + >>> answer + + >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=['k', 'v']) + >>> result = table.group_by('k').aggregate([('v', 'py_compute_median')]) + >>> result + pyarrow.Table + k: int64 + v_py_compute_median: double + ---- + k: [[1,2]] + v_py_compute_median: [[15,35]] + """ + return _register_user_defined_function(get_register_aggregate_function(), + func, function_name, function_doc, in_types, + out_type, func_registry) + + +def register_tabular_function(func, function_name, function_doc, in_types, out_type, + func_registry=None): + """ + Register a user-defined tabular function. + + This API is EXPERIMENTAL. + + A tabular function is one accepting a context argument of type + UdfContext and returning a generator of struct arrays. + The in_types argument must be empty and the out_type argument + specifies a schema. Each struct array must have field types + corresponding to the schema. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The only argument is the context argument of type + UdfContext. It must return a callable that + returns on each invocation a StructArray matching + the out_type, where an empty array indicates end. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + Must be an empty dictionary (reserved for future use). + out_type : Union[Schema, DataType] + Schema of the function's output, or a corresponding flat struct type. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + cdef: + shared_ptr[CSchema] c_schema + shared_ptr[CDataType] c_type + + if isinstance(out_type, Schema): + c_schema = pyarrow_unwrap_schema(out_type) + with nogil: + c_type = make_shared[CStructType](deref(c_schema).fields()) + out_type = pyarrow_wrap_data_type(c_type) + return _register_user_defined_function(get_register_tabular_function(), + func, function_name, function_doc, in_types, + out_type, func_registry) + + +def _register_user_defined_function(register_func, func, function_name, function_doc, in_types, + out_type, func_registry=None): + """ + Register a user-defined function. + + This method itself doesn't care about the type of the UDF + (i.e., scalar vs tabular vs aggregate) + + Parameters + ---------- + register_func: object + An object holding a CRegisterUdf in a "register_func" attribute. + func : callable + A callable implementing the user-defined function. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. 
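+
+ Examples
+ --------
+ A rough sketch of the argument shapes the public
+ ``register_*_function`` wrappers forward to this helper (the names
+ and types here are illustrative only):
+
+ >>> import pyarrow as pa
+ >>> function_doc = {"summary": "add one",
+ ...                 "description": "adds 1 to every input value"}
+ >>> in_types = {"values": pa.int64()}  # argument name -> DataType
+ >>> out_type = pa.int64()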
+ """ + cdef: + CRegisterUdf c_register_func + c_string c_func_name + CArity c_arity + CFunctionDoc c_func_doc + vector[shared_ptr[CDataType]] c_in_types + PyObject* c_function + shared_ptr[CDataType] c_out_type + CUdfOptions c_options + CFunctionRegistry* c_func_registry + + if callable(func): + c_function = func + else: + raise TypeError("func must be a callable") + + c_func_name = tobytes(function_name) + + func_spec = inspect.getfullargspec(func) + num_args = -1 + if isinstance(in_types, dict): + for in_type in in_types.values(): + c_in_types.push_back( + pyarrow_unwrap_data_type(ensure_type(in_type))) + function_doc["arg_names"] = in_types.keys() + num_args = len(in_types) + else: + raise TypeError( + "in_types must be a dictionary of DataType") + + c_arity = CArity( num_args, func_spec.varargs) + + if "summary" not in function_doc: + raise ValueError("Function doc must contain a summary") + + if "description" not in function_doc: + raise ValueError("Function doc must contain a description") + + if "arg_names" not in function_doc: + raise ValueError("Function doc must contain arg_names") + + c_func_doc = _make_function_doc(function_doc) + + c_out_type = pyarrow_unwrap_data_type(ensure_type(out_type)) + + c_options.func_name = c_func_name + c_options.arity = c_arity + c_options.func_doc = c_func_doc + c_options.input_types = c_in_types + c_options.output_type = c_out_type + + if func_registry is None: + c_func_registry = NULL + else: + c_func_registry = (func_registry).registry + + c_register_func = (register_func).register_func + + check_status(c_register_func(c_function, + &_udf_callback, + c_options, c_func_registry)) + + +def call_tabular_function(function_name, args=None, func_registry=None): + """ + Get a record batch iterator from a tabular function. + + Parameters + ---------- + function_name : str + Name of the function. + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. Currently, only an empty args is supported. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + cdef: + c_string c_func_name + vector[CDatum] c_args + CFunctionRegistry* c_func_registry + shared_ptr[CRecordBatchReader] c_reader + RecordBatchReader reader + + c_func_name = tobytes(function_name) + if func_registry is None: + c_func_registry = NULL + else: + c_func_registry = (func_registry).registry + if args is None: + args = [] + _pack_compute_args(args, &c_args) + + with nogil: + c_reader = GetResultValue(CallTabularFunction( + c_func_name, c_args, c_func_registry)) + reader = RecordBatchReader.__new__(RecordBatchReader) + reader.reader = c_reader + return RecordBatchReader.from_batches(pyarrow_wrap_schema(deref(c_reader).schema()), reader) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute_docstrings.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute_docstrings.py new file mode 100644 index 0000000000000000000000000000000000000000..150dbdb1175803e3c40a1bd2469a4df34ea57e4e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_compute_docstrings.py @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Custom documentation additions for compute functions. +""" + +function_doc_additions = {} + +function_doc_additions["filter"] = """ + Examples + -------- + >>> import pyarrow as pa + >>> arr = pa.array(["a", "b", "c", None, "e"]) + >>> mask = pa.array([True, False, None, False, True]) + >>> arr.filter(mask) + + [ + "a", + "e" + ] + >>> arr.filter(mask, null_selection_behavior='emit_null') + + [ + "a", + null, + "e" + ] + """ + +function_doc_additions["mode"] = """ + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) + >>> modes = pc.mode(arr, 2) + >>> modes[0] + + >>> modes[1] + + """ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..88fa17f3a29c8d27c4d07e06c03735b11303f256 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.pxd new file mode 100644 index 0000000000000000000000000000000000000000..dcc562a41c795896d12fc7cdd3baebf0122bedc9 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.pxd @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: language_level = 3 + +from pyarrow.includes.libarrow cimport * +from pyarrow.lib cimport _Weakrefable + + +cdef class ConvertOptions(_Weakrefable): + cdef: + unique_ptr[CCSVConvertOptions] options + + @staticmethod + cdef ConvertOptions wrap(CCSVConvertOptions options) + + +cdef class ParseOptions(_Weakrefable): + cdef: + unique_ptr[CCSVParseOptions] options + object _invalid_row_handler + + @staticmethod + cdef ParseOptions wrap(CCSVParseOptions options) + + +cdef class ReadOptions(_Weakrefable): + cdef: + unique_ptr[CCSVReadOptions] options + public object encoding + + @staticmethod + cdef ReadOptions wrap(CCSVReadOptions options) + + +cdef class WriteOptions(_Weakrefable): + cdef: + unique_ptr[CCSVWriteOptions] options + + @staticmethod + cdef WriteOptions wrap(CCSVWriteOptions options) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.pyx new file mode 100644 index 0000000000000000000000000000000000000000..508488c0c3b3c3bcd2d2157f57f625b1e5b92c2e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_csv.pyx @@ -0,0 +1,1542 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: language_level = 3 + +from cython.operator cimport dereference as deref + +from collections import namedtuple +from collections.abc import Mapping + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_python cimport * +from pyarrow.lib cimport (check_status, Field, MemoryPool, Schema, + RecordBatchReader, ensure_type, + maybe_unbox_memory_pool, get_input_stream, + get_writer, native_transcoding_input_stream, + pyarrow_unwrap_batch, pyarrow_unwrap_schema, + pyarrow_unwrap_table, pyarrow_wrap_schema, + pyarrow_wrap_table, pyarrow_wrap_data_type, + pyarrow_unwrap_data_type, Table, RecordBatch, + StopToken, _CRecordBatchWriter) +from pyarrow.lib import frombytes, tobytes, SignalStopHandler + + +cdef unsigned char _single_char(s) except 0: + val = ord(s) + if val == 0 or val > 127: + raise ValueError("Expecting an ASCII character") + return val + + +_InvalidRow = namedtuple( + "_InvalidRow", ("expected_columns", "actual_columns", "number", "text"), + module=__name__) + + +class InvalidRow(_InvalidRow): + """ + Description of an invalid row in a CSV file. + + Parameters + ---------- + expected_columns : int + The expected number of columns in the row. + actual_columns : int + The actual number of columns in the row. + number : int or None + The physical row number if known, otherwise None. + text : str + The contents of the row. 
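+
+ Examples
+ --------
+ A minimal sketch of skipping malformed rows: the handler receives an
+ ``InvalidRow`` and returns "skip" or "error" (this assumes the
+ ``invalid_row_handler`` option of ``ParseOptions``):
+
+ >>> import io
+ >>> from pyarrow import csv
+ >>> data = b"a,b\\n1,2\\n3,4,5\\n6,7"
+ >>> opts = csv.ParseOptions(invalid_row_handler=lambda row: "skip")
+ >>> csv.read_csv(io.BytesIO(data), parse_options=opts).num_rows
+ 2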
+ """ + __slots__ = () + + +cdef CInvalidRowResult _handle_invalid_row( + handler, const CCSVInvalidRow& c_row) except CInvalidRowResult_Error: + # A negative row number means undetermined (because of parallel reading) + row_number = c_row.number if c_row.number >= 0 else None + row = InvalidRow(c_row.expected_columns, c_row.actual_columns, + row_number, frombytes( c_row.text)) + result = handler(row) + if result == 'error': + return CInvalidRowResult_Error + elif result == 'skip': + return CInvalidRowResult_Skip + else: + raise ValueError("Invalid return value for invalid row handler: " + f"expected 'error' or 'skip', got {result!r}") + + +cdef class ReadOptions(_Weakrefable): + """ + Options for reading CSV files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual record batches or table chunks. + Minimum valid value for block size is 1 + skip_rows : int, optional (default 0) + The number of rows to skip before the column names (if any) + and the CSV data. + skip_rows_after_names : int, optional (default 0) + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names are read (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). + column_names : list, optional + The column names of the target table. If empty, fall back on + `autogenerate_column_names`. + autogenerate_column_names : bool, optional (default False) + Whether to autogenerate column names if `column_names` is empty. + If true, column names will be of the form "f0", "f1"... + If false, column names will be read from the first CSV row + after `skip_rows`. + encoding : str, optional (default 'utf8') + The character encoding of the CSV data. Columns that cannot + decode using this encoding can still be read as Binary. + + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" + >>> print(s) + 1,2,3 + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + + Ignore the first numbered row and substitute it with defined + or autogenerated column names: + + >>> from pyarrow import csv + >>> read_options = csv.ReadOptions( + ... column_names=["animals", "n_legs", "entry"], + ... skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + >>> read_options = csv.ReadOptions(autogenerate_column_names=True, + ... 
skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + f0: string + f1: int64 + f2: date32[day] + ---- + f0: [["Flamingo","Horse","Brittle stars","Centipede"]] + f1: [[2,4,5,100]] + f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + Remove the first 2 rows of the data: + + >>> read_options = csv.ReadOptions(skip_rows_after_names=2) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + 1: string + 2: int64 + 3: date32[day] + ---- + 1: [["Brittle stars","Centipede"]] + 2: [[5,100]] + 3: [[2022-03-03,2022-03-04]] + """ + + # Avoid mistakingly creating attributes + __slots__ = () + + # __init__() is not called when unpickling, initialize storage here + def __cinit__(self, *argw, **kwargs): + self.options.reset(new CCSVReadOptions(CCSVReadOptions.Defaults())) + + def __init__(self, *, use_threads=None, block_size=None, skip_rows=None, + skip_rows_after_names=None, column_names=None, + autogenerate_column_names=None, encoding='utf8'): + if use_threads is not None: + self.use_threads = use_threads + if block_size is not None: + self.block_size = block_size + if skip_rows is not None: + self.skip_rows = skip_rows + if skip_rows_after_names is not None: + self.skip_rows_after_names = skip_rows_after_names + if column_names is not None: + self.column_names = column_names + if autogenerate_column_names is not None: + self.autogenerate_column_names= autogenerate_column_names + # Python-specific option + self.encoding = encoding + + @property + def use_threads(self): + """ + Whether to use multiple threads to accelerate reading. + """ + return deref(self.options).use_threads + + @use_threads.setter + def use_threads(self, value): + deref(self.options).use_threads = value + + @property + def block_size(self): + """ + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual record batches or table chunks. + """ + return deref(self.options).block_size + + @block_size.setter + def block_size(self, value): + deref(self.options).block_size = value + + @property + def skip_rows(self): + """ + The number of rows to skip before the column names (if any) + and the CSV data. + See `skip_rows_after_names` for interaction description + """ + return deref(self.options).skip_rows + + @skip_rows.setter + def skip_rows(self, value): + deref(self.options).skip_rows = value + + @property + def skip_rows_after_names(self): + """ + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names are read (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). + """ + return deref(self.options).skip_rows_after_names + + @skip_rows_after_names.setter + def skip_rows_after_names(self, value): + deref(self.options).skip_rows_after_names = value + + @property + def column_names(self): + """ + The column names of the target table. If empty, fall back on + `autogenerate_column_names`. 
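# A minimal sketch, assuming pyarrow is installed, of the ReadOptions fields
# documented above (column_names, skip_rows, block_size, encoding).
import io
from pyarrow import csv

read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"],
                               skip_rows=1,          # drop the original header row
                               block_size=1 << 20,   # process ~1 MiB per chunk
                               encoding="utf8")
data = b"x,y,z\nFlamingo,2,2022-03-01\nHorse,4,2022-03-02\n"
table = csv.read_csv(io.BytesIO(data), read_options=read_options)
assert table.column_names == ["animals", "n_legs", "entry"]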
+ """ + return [frombytes(s) for s in deref(self.options).column_names] + + @column_names.setter + def column_names(self, value): + deref(self.options).column_names.clear() + for item in value: + deref(self.options).column_names.push_back(tobytes(item)) + + @property + def autogenerate_column_names(self): + """ + Whether to autogenerate column names if `column_names` is empty. + If true, column names will be of the form "f0", "f1"... + If false, column names will be read from the first CSV row + after `skip_rows`. + """ + return deref(self.options).autogenerate_column_names + + @autogenerate_column_names.setter + def autogenerate_column_names(self, value): + deref(self.options).autogenerate_column_names = value + + def validate(self): + check_status(deref(self.options).Validate()) + + def equals(self, ReadOptions other): + """ + Parameters + ---------- + other : pyarrow.csv.ReadOptions + + Returns + ------- + bool + """ + return ( + self.use_threads == other.use_threads and + self.block_size == other.block_size and + self.skip_rows == other.skip_rows and + self.skip_rows_after_names == other.skip_rows_after_names and + self.column_names == other.column_names and + self.autogenerate_column_names == + other.autogenerate_column_names and + self.encoding == other.encoding + ) + + @staticmethod + cdef ReadOptions wrap(CCSVReadOptions options): + out = ReadOptions() + out.options.reset(new CCSVReadOptions(move(options))) + out.encoding = 'utf8' # No way to know this + return out + + def __getstate__(self): + return (self.use_threads, self.block_size, self.skip_rows, + self.column_names, self.autogenerate_column_names, + self.encoding, self.skip_rows_after_names) + + def __setstate__(self, state): + (self.use_threads, self.block_size, self.skip_rows, + self.column_names, self.autogenerate_column_names, + self.encoding, self.skip_rows_after_names) = state + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False + + +cdef class ParseOptions(_Weakrefable): + """ + Options for parsing CSV files. + + Parameters + ---------- + delimiter : 1-character string, optional (default ',') + The character delimiting individual cells in the CSV data. + quote_char : 1-character string or False, optional (default '"') + The character used optionally for quoting CSV values + (False if quoting is not allowed). + double_quote : bool, optional (default True) + Whether two quotes in a quoted CSV value denote a single quote + in the data. + escape_char : 1-character string or False, optional (default False) + The character used optionally for escaping special characters + (False if escaping is not allowed). + newlines_in_values : bool, optional (default False) + Whether newline characters are allowed in CSV values. + Setting this to True reduces the performance of multi-threaded + CSV reading. + ignore_empty_lines : bool, optional (default True) + Whether empty lines are ignored in CSV input. + If False, an empty line is interpreted as containing a single empty + value (assuming a one-column CSV file). + invalid_row_handler : callable, optional (default None) + If not None, this object is called for each CSV row that fails + parsing (because of a mismatching number of columns). + It should accept a single InvalidRow argument and return either + "skip" or "error" depending on the desired outcome. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals;n_legs;entry\\n" + ... "Flamingo;2;2022-03-01\\n" + ... 
"# Comment here:\\n" + ... "Horse;4;2022-03-02\\n" + ... "Brittle stars;5;2022-03-03\\n" + ... "Centipede;100;2022-03-04" + ... ) + >>> print(s) + animals;n_legs;entry + Flamingo;2;2022-03-01 + # Comment here: + Horse;4;2022-03-02 + Brittle stars;5;2022-03-03 + Centipede;100;2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Read the data from a file skipping rows with comments + and defining the delimiter: + + >>> from pyarrow import csv + >>> def skip_comment(row): + ... if row.text.startswith("# "): + ... return 'skip' + ... else: + ... return 'error' + ... + >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) + >>> csv.read_csv(source, parse_options=parse_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + __slots__ = () + + def __cinit__(self, *argw, **kwargs): + self._invalid_row_handler = None + self.options.reset(new CCSVParseOptions(CCSVParseOptions.Defaults())) + + def __init__(self, *, delimiter=None, quote_char=None, double_quote=None, + escape_char=None, newlines_in_values=None, + ignore_empty_lines=None, invalid_row_handler=None): + if delimiter is not None: + self.delimiter = delimiter + if quote_char is not None: + self.quote_char = quote_char + if double_quote is not None: + self.double_quote = double_quote + if escape_char is not None: + self.escape_char = escape_char + if newlines_in_values is not None: + self.newlines_in_values = newlines_in_values + if ignore_empty_lines is not None: + self.ignore_empty_lines = ignore_empty_lines + if invalid_row_handler is not None: + self.invalid_row_handler = invalid_row_handler + + @property + def delimiter(self): + """ + The character delimiting individual cells in the CSV data. + """ + return chr(deref(self.options).delimiter) + + @delimiter.setter + def delimiter(self, value): + deref(self.options).delimiter = _single_char(value) + + @property + def quote_char(self): + """ + The character used optionally for quoting CSV values + (False if quoting is not allowed). + """ + if deref(self.options).quoting: + return chr(deref(self.options).quote_char) + else: + return False + + @quote_char.setter + def quote_char(self, value): + if value is False: + deref(self.options).quoting = False + else: + deref(self.options).quote_char = _single_char(value) + deref(self.options).quoting = True + + @property + def double_quote(self): + """ + Whether two quotes in a quoted CSV value denote a single quote + in the data. + """ + return deref(self.options).double_quote + + @double_quote.setter + def double_quote(self, value): + deref(self.options).double_quote = value + + @property + def escape_char(self): + """ + The character used optionally for escaping special characters + (False if escaping is not allowed). + """ + if deref(self.options).escaping: + return chr(deref(self.options).escape_char) + else: + return False + + @escape_char.setter + def escape_char(self, value): + if value is False: + deref(self.options).escaping = False + else: + deref(self.options).escape_char = _single_char(value) + deref(self.options).escaping = True + + @property + def newlines_in_values(self): + """ + Whether newline characters are allowed in CSV values. + Setting this to True reduces the performance of multi-threaded + CSV reading. 
+ """ + return deref(self.options).newlines_in_values + + @newlines_in_values.setter + def newlines_in_values(self, value): + deref(self.options).newlines_in_values = value + + @property + def ignore_empty_lines(self): + """ + Whether empty lines are ignored in CSV input. + If False, an empty line is interpreted as containing a single empty + value (assuming a one-column CSV file). + """ + return deref(self.options).ignore_empty_lines + + @property + def invalid_row_handler(self): + """ + Optional handler for invalid rows. + + If not None, this object is called for each CSV row that fails + parsing (because of a mismatching number of columns). + It should accept a single InvalidRow argument and return either + "skip" or "error" depending on the desired outcome. + """ + return self._invalid_row_handler + + @invalid_row_handler.setter + def invalid_row_handler(self, value): + if value is not None and not callable(value): + raise TypeError("Expected callable or None, " + f"got instance of {type(value)!r}") + self._invalid_row_handler = value + deref(self.options).invalid_row_handler = MakeInvalidRowHandler( + &_handle_invalid_row, value) + + @ignore_empty_lines.setter + def ignore_empty_lines(self, value): + deref(self.options).ignore_empty_lines = value + + def validate(self): + check_status(deref(self.options).Validate()) + + def equals(self, ParseOptions other): + """ + Parameters + ---------- + other : pyarrow.csv.ParseOptions + + Returns + ------- + bool + """ + return ( + self.delimiter == other.delimiter and + self.quote_char == other.quote_char and + self.double_quote == other.double_quote and + self.escape_char == other.escape_char and + self.newlines_in_values == other.newlines_in_values and + self.ignore_empty_lines == other.ignore_empty_lines and + self._invalid_row_handler == other._invalid_row_handler + ) + + @staticmethod + cdef ParseOptions wrap(CCSVParseOptions options): + out = ParseOptions() + out.options.reset(new CCSVParseOptions(move(options))) + return out + + def __getstate__(self): + return (self.delimiter, self.quote_char, self.double_quote, + self.escape_char, self.newlines_in_values, + self.ignore_empty_lines, self.invalid_row_handler) + + def __setstate__(self, state): + (self.delimiter, self.quote_char, self.double_quote, + self.escape_char, self.newlines_in_values, + self.ignore_empty_lines, self.invalid_row_handler) = state + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False + + +cdef class _ISO8601(_Weakrefable): + """ + A special object indicating ISO-8601 parsing. + """ + __slots__ = () + + def __str__(self): + return 'ISO8601' + + def __eq__(self, other): + return isinstance(other, _ISO8601) + + +ISO8601 = _ISO8601() + + +cdef class ConvertOptions(_Weakrefable): + """ + Options for converting CSV data. + + Parameters + ---------- + check_utf8 : bool, optional (default True) + Whether to check UTF8 validity of string columns. + column_types : pyarrow.Schema or dict, optional + Explicitly map column names to column types. Passing this argument + disables type inference on the defined columns. + null_values : list, optional + A sequence of strings that denote nulls in the data + (defaults are appropriate in most cases). Note that by default, + string columns are not checked for null values. To enable + null checking for those, specify ``strings_can_be_null=True``. + true_values : list, optional + A sequence of strings that denote true booleans in the data + (defaults are appropriate in most cases). 
+ false_values : list, optional + A sequence of strings that denote false booleans in the data + (defaults are appropriate in most cases). + decimal_point : 1-character string, optional (default '.') + The character used as decimal point in floating-point and decimal + data. + strings_can_be_null : bool, optional (default False) + Whether string / binary columns can have null values. + If true, then strings in null_values are considered null for + string columns. + If false, then all strings are valid string values. + quoted_strings_can_be_null : bool, optional (default True) + Whether quoted values can be null. + If true, then strings in "null_values" are also considered null + when they appear quoted in the CSV file. Otherwise, quoted values + are never considered null. + include_columns : list, optional + The names of columns to include in the Table. + If empty, the Table will include all columns from the CSV file. + If not empty, only these columns will be included, in this order. + include_missing_columns : bool, optional (default False) + If false, columns in `include_columns` but not in the CSV file will + error out. + If true, columns in `include_columns` but not in the CSV file will + produce a column of nulls (whose type is selected using + `column_types`, or null by default). + This option is ignored if `include_columns` is empty. + auto_dict_encode : bool, optional (default False) + Whether to try to automatically dict-encode string / binary data. + If true, then when type inference detects a string or binary column, + it it dict-encoded up to `auto_dict_max_cardinality` distinct values + (per chunk), after which it switches to regular encoding. + This setting is ignored for non-inferred columns (those in + `column_types`). + auto_dict_max_cardinality : int, optional + The maximum dictionary cardinality for `auto_dict_encode`. + This value is per chunk. + timestamp_parsers : list, optional + A sequence of strptime()-compatible format strings, tried in order + when attempting to infer or convert timestamp values (the special + value ISO8601() can also be given). By default, a fast built-in + ISO-8601 parser is used. + + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry,fast\\n" + ... "Flamingo,2,01/03/2022,Yes\\n" + ... "Horse,4,02/03/2022,Yes\\n" + ... "Brittle stars,5,03/03/2022,No\\n" + ... "Centipede,100,04/03/2022,No\\n" + ... ",6,05/03/2022," + ... ) + >>> print(s) + animals,n_legs,entry,fast + Flamingo,2,01/03/2022,Yes + Horse,4,02/03/2022,Yes + Brittle stars,5,03/03/2022,No + Centipede,100,04/03/2022,No + ,6,05/03/2022, + + Change the type of a column: + + >>> import pyarrow as pa + >>> from pyarrow import csv + >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()}) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: double + entry: string + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]] + fast: [["Yes","Yes","No","No",""]] + + Define a date parsing format to get a timestamp type column + (in case dates are not in ISO format and not converted by default): + + >>> convert_options = csv.ConvertOptions( + ... 
timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: timestamp[s] + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [["Yes","Yes","No","No",""]] + + Specify a subset of columns to be read: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + + List additional column to be included as a null typed column: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs", "location"], + ... include_missing_columns=True) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + location: null + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + location: [5 nulls] + + Define columns as dictionary type (by default only the + string/binary columns are dictionary encoded): + + >>> convert_options = csv.ConvertOptions( + ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], + ... auto_dict_encode=True) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: dictionary + n_legs: int64 + entry: timestamp[s] + fast: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: + [0,1,2,3,4]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [ -- dictionary: + ["Yes","No",""] -- indices: + [0,0,1,1,2]] + + Set upper limit for the number of categories. If the categories + is more than the limit, the conversion to dictionary will not + happen: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals"], + ... auto_dict_encode=True, + ... auto_dict_max_cardinality=2) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + + Set empty strings to missing values: + + >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"], + ... strings_can_be_null=True) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] + n_legs: [[2,4,5,100,6]] + + Define values to be True and False when converting a column + into a bool type: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["fast"], + ... false_values=["No"], + ... 
true_values=["Yes"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + fast: bool + ---- + fast: [[true,true,false,false,null]] + """ + + # Avoid mistakingly creating attributes + __slots__ = () + + def __cinit__(self, *argw, **kwargs): + self.options.reset( + new CCSVConvertOptions(CCSVConvertOptions.Defaults())) + + def __init__(self, *, check_utf8=None, column_types=None, null_values=None, + true_values=None, false_values=None, decimal_point=None, + strings_can_be_null=None, quoted_strings_can_be_null=None, + include_columns=None, include_missing_columns=None, + auto_dict_encode=None, auto_dict_max_cardinality=None, + timestamp_parsers=None): + if check_utf8 is not None: + self.check_utf8 = check_utf8 + if column_types is not None: + self.column_types = column_types + if null_values is not None: + self.null_values = null_values + if true_values is not None: + self.true_values = true_values + if false_values is not None: + self.false_values = false_values + if decimal_point is not None: + self.decimal_point = decimal_point + if strings_can_be_null is not None: + self.strings_can_be_null = strings_can_be_null + if quoted_strings_can_be_null is not None: + self.quoted_strings_can_be_null = quoted_strings_can_be_null + if include_columns is not None: + self.include_columns = include_columns + if include_missing_columns is not None: + self.include_missing_columns = include_missing_columns + if auto_dict_encode is not None: + self.auto_dict_encode = auto_dict_encode + if auto_dict_max_cardinality is not None: + self.auto_dict_max_cardinality = auto_dict_max_cardinality + if timestamp_parsers is not None: + self.timestamp_parsers = timestamp_parsers + + @property + def check_utf8(self): + """ + Whether to check UTF8 validity of string columns. + """ + return deref(self.options).check_utf8 + + @check_utf8.setter + def check_utf8(self, value): + deref(self.options).check_utf8 = value + + @property + def strings_can_be_null(self): + """ + Whether string / binary columns can have null values. + """ + return deref(self.options).strings_can_be_null + + @strings_can_be_null.setter + def strings_can_be_null(self, value): + deref(self.options).strings_can_be_null = value + + @property + def quoted_strings_can_be_null(self): + """ + Whether quoted values can be null. + """ + return deref(self.options).quoted_strings_can_be_null + + @quoted_strings_can_be_null.setter + def quoted_strings_can_be_null(self, value): + deref(self.options).quoted_strings_can_be_null = value + + @property + def column_types(self): + """ + Explicitly map column names to column types. + """ + d = {frombytes(item.first): pyarrow_wrap_data_type(item.second) + for item in deref(self.options).column_types} + return d + + @column_types.setter + def column_types(self, value): + cdef: + shared_ptr[CDataType] typ + + if isinstance(value, Mapping): + value = value.items() + + deref(self.options).column_types.clear() + for item in value: + if isinstance(item, Field): + k = item.name + v = item.type + else: + k, v = item + typ = pyarrow_unwrap_data_type(ensure_type(v)) + assert typ != NULL + deref(self.options).column_types[tobytes(k)] = typ + + @property + def null_values(self): + """ + A sequence of strings that denote nulls in the data. 
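# A minimal sketch, assuming pyarrow is installed, of the ConvertOptions fields
# documented above (column_types, null_values, strings_can_be_null).
import io
import pyarrow as pa
from pyarrow import csv

convert_options = csv.ConvertOptions(
    column_types={"n_legs": pa.int32()},  # disable inference for this column
    null_values=["", "NA"],               # these strings become null...
    strings_can_be_null=True,             # ...in string columns as well
)
data = b"animals,n_legs\nFlamingo,2\nNA,4\n"
table = csv.read_csv(io.BytesIO(data), convert_options=convert_options)
assert table.schema.field("n_legs").type == pa.int32()
assert table.column("animals").to_pylist() == ["Flamingo", None]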
+ """ + return [frombytes(x) for x in deref(self.options).null_values] + + @null_values.setter + def null_values(self, value): + deref(self.options).null_values = [tobytes(x) for x in value] + + @property + def true_values(self): + """ + A sequence of strings that denote true booleans in the data. + """ + return [frombytes(x) for x in deref(self.options).true_values] + + @true_values.setter + def true_values(self, value): + deref(self.options).true_values = [tobytes(x) for x in value] + + @property + def false_values(self): + """ + A sequence of strings that denote false booleans in the data. + """ + return [frombytes(x) for x in deref(self.options).false_values] + + @false_values.setter + def false_values(self, value): + deref(self.options).false_values = [tobytes(x) for x in value] + + @property + def decimal_point(self): + """ + The character used as decimal point in floating-point and decimal + data. + """ + return chr(deref(self.options).decimal_point) + + @decimal_point.setter + def decimal_point(self, value): + deref(self.options).decimal_point = _single_char(value) + + @property + def auto_dict_encode(self): + """ + Whether to try to automatically dict-encode string / binary data. + """ + return deref(self.options).auto_dict_encode + + @auto_dict_encode.setter + def auto_dict_encode(self, value): + deref(self.options).auto_dict_encode = value + + @property + def auto_dict_max_cardinality(self): + """ + The maximum dictionary cardinality for `auto_dict_encode`. + + This value is per chunk. + """ + return deref(self.options).auto_dict_max_cardinality + + @auto_dict_max_cardinality.setter + def auto_dict_max_cardinality(self, value): + deref(self.options).auto_dict_max_cardinality = value + + @property + def include_columns(self): + """ + The names of columns to include in the Table. + + If empty, the Table will include all columns from the CSV file. + If not empty, only these columns will be included, in this order. + """ + return [frombytes(s) for s in deref(self.options).include_columns] + + @include_columns.setter + def include_columns(self, value): + deref(self.options).include_columns.clear() + for item in value: + deref(self.options).include_columns.push_back(tobytes(item)) + + @property + def include_missing_columns(self): + """ + If false, columns in `include_columns` but not in the CSV file will + error out. + If true, columns in `include_columns` but not in the CSV file will + produce a null column (whose type is selected using `column_types`, + or null by default). + This option is ignored if `include_columns` is empty. + """ + return deref(self.options).include_missing_columns + + @include_missing_columns.setter + def include_missing_columns(self, value): + deref(self.options).include_missing_columns = value + + @property + def timestamp_parsers(self): + """ + A sequence of strptime()-compatible format strings, tried in order + when attempting to infer or convert timestamp values (the special + value ISO8601() can also be given). By default, a fast built-in + ISO-8601 parser is used. 
+ """ + cdef: + shared_ptr[CTimestampParser] c_parser + c_string kind + + parsers = [] + for c_parser in deref(self.options).timestamp_parsers: + kind = deref(c_parser).kind() + if kind == b'strptime': + parsers.append(frombytes(deref(c_parser).format())) + else: + assert kind == b'iso8601' + parsers.append(ISO8601) + + return parsers + + @timestamp_parsers.setter + def timestamp_parsers(self, value): + cdef: + vector[shared_ptr[CTimestampParser]] c_parsers + + for v in value: + if isinstance(v, str): + c_parsers.push_back(CTimestampParser.MakeStrptime(tobytes(v))) + elif v == ISO8601: + c_parsers.push_back(CTimestampParser.MakeISO8601()) + else: + raise TypeError("Expected list of str or ISO8601 objects") + + deref(self.options).timestamp_parsers = move(c_parsers) + + @staticmethod + cdef ConvertOptions wrap(CCSVConvertOptions options): + out = ConvertOptions() + out.options.reset(new CCSVConvertOptions(move(options))) + return out + + def validate(self): + check_status(deref(self.options).Validate()) + + def equals(self, ConvertOptions other): + """ + Parameters + ---------- + other : pyarrow.csv.ConvertOptions + + Returns + ------- + bool + """ + return ( + self.check_utf8 == other.check_utf8 and + self.column_types == other.column_types and + self.null_values == other.null_values and + self.true_values == other.true_values and + self.false_values == other.false_values and + self.decimal_point == other.decimal_point and + self.timestamp_parsers == other.timestamp_parsers and + self.strings_can_be_null == other.strings_can_be_null and + self.quoted_strings_can_be_null == + other.quoted_strings_can_be_null and + self.auto_dict_encode == other.auto_dict_encode and + self.auto_dict_max_cardinality == + other.auto_dict_max_cardinality and + self.include_columns == other.include_columns and + self.include_missing_columns == other.include_missing_columns + ) + + def __getstate__(self): + return (self.check_utf8, self.column_types, self.null_values, + self.true_values, self.false_values, self.decimal_point, + self.timestamp_parsers, self.strings_can_be_null, + self.quoted_strings_can_be_null, self.auto_dict_encode, + self.auto_dict_max_cardinality, self.include_columns, + self.include_missing_columns) + + def __setstate__(self, state): + (self.check_utf8, self.column_types, self.null_values, + self.true_values, self.false_values, self.decimal_point, + self.timestamp_parsers, self.strings_can_be_null, + self.quoted_strings_can_be_null, self.auto_dict_encode, + self.auto_dict_max_cardinality, self.include_columns, + self.include_missing_columns) = state + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False + + +cdef _get_reader(input_file, ReadOptions read_options, + shared_ptr[CInputStream]* out): + use_memory_map = False + get_input_stream(input_file, use_memory_map, out) + if read_options is not None: + out[0] = native_transcoding_input_stream(out[0], + read_options.encoding, + 'utf8') + + +cdef _get_read_options(ReadOptions read_options, CCSVReadOptions* out): + if read_options is None: + out[0] = CCSVReadOptions.Defaults() + else: + out[0] = deref(read_options.options) + + +cdef _get_parse_options(ParseOptions parse_options, CCSVParseOptions* out): + if parse_options is None: + out[0] = CCSVParseOptions.Defaults() + else: + out[0] = deref(parse_options.options) + + +cdef _get_convert_options(ConvertOptions convert_options, + CCSVConvertOptions* out): + if convert_options is None: + out[0] = CCSVConvertOptions.Defaults() + else: + out[0] = 
deref(convert_options.options) + + +cdef class CSVStreamingReader(RecordBatchReader): + """An object that reads record batches incrementally from a CSV file. + + Should not be instantiated directly by user code. + """ + cdef readonly: + Schema schema + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly, " + "use pyarrow.csv.open_csv() instead." + .format(self.__class__.__name__)) + + # Note about cancellation: we cannot create a SignalStopHandler + # by default here, as several CSVStreamingReader instances may be + # created (including by the same thread). Handling cancellation + # would require having the user pass the SignalStopHandler. + # (in addition to solving ARROW-11853) + + cdef _open(self, shared_ptr[CInputStream] stream, + CCSVReadOptions c_read_options, + CCSVParseOptions c_parse_options, + CCSVConvertOptions c_convert_options, + MemoryPool memory_pool): + cdef: + shared_ptr[CSchema] c_schema + CIOContext io_context + + io_context = CIOContext(maybe_unbox_memory_pool(memory_pool)) + + with nogil: + self.reader = GetResultValue( + CCSVStreamingReader.Make( + io_context, stream, + move(c_read_options), move(c_parse_options), + move(c_convert_options))) + c_schema = self.reader.get().schema() + + self.schema = pyarrow_wrap_schema(c_schema) + + +def read_csv(input_file, read_options=None, parse_options=None, + convert_options=None, MemoryPool memory_pool=None): + """ + Read a Table from a stream of CSV data. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate Table memory from + + Returns + ------- + :class:`pyarrow.Table` + Contents of the CSV file as a in-memory table. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry\\n" + ... "Flamingo,2,2022-03-01\\n" + ... "Horse,4,2022-03-02\\n" + ... "Brittle stars,5,2022-03-03\\n" + ... "Centipede,100,2022-03-04" + ... 
) + >>> print(s) + animals,n_legs,entry + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Reading from the file + + >>> from pyarrow import csv + >>> csv.read_csv(source) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + cdef: + shared_ptr[CInputStream] stream + CCSVReadOptions c_read_options + CCSVParseOptions c_parse_options + CCSVConvertOptions c_convert_options + CIOContext io_context + SharedPtrNoGIL[CCSVReader] reader + shared_ptr[CTable] table + + _get_reader(input_file, read_options, &stream) + _get_read_options(read_options, &c_read_options) + _get_parse_options(parse_options, &c_parse_options) + _get_convert_options(convert_options, &c_convert_options) + + with SignalStopHandler() as stop_handler: + io_context = CIOContext( + maybe_unbox_memory_pool(memory_pool), + ( stop_handler.stop_token).stop_token) + reader = GetResultValue(CCSVReader.Make( + io_context, stream, + c_read_options, c_parse_options, c_convert_options)) + + with nogil: + table = GetResultValue(reader.get().Read()) + + return pyarrow_wrap_table(table) + + +def open_csv(input_file, read_options=None, parse_options=None, + convert_options=None, MemoryPool memory_pool=None): + """ + Open a streaming reader of CSV data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. 
+ read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate Table memory from + + Returns + ------- + :class:`pyarrow.csv.CSVStreamingReader` + """ + cdef: + shared_ptr[CInputStream] stream + CCSVReadOptions c_read_options + CCSVParseOptions c_parse_options + CCSVConvertOptions c_convert_options + CSVStreamingReader reader + + _get_reader(input_file, read_options, &stream) + _get_read_options(read_options, &c_read_options) + _get_parse_options(parse_options, &c_parse_options) + _get_convert_options(convert_options, &c_convert_options) + + reader = CSVStreamingReader.__new__(CSVStreamingReader) + reader._open(stream, move(c_read_options), move(c_parse_options), + move(c_convert_options), memory_pool) + return reader + + +def _raise_invalid_function_option(value, description, *, + exception_class=ValueError): + raise exception_class(f"\"{value}\" is not a valid {description}") + + +cdef CQuotingStyle unwrap_quoting_style(quoting_style) except *: + if quoting_style == "needed": + return CQuotingStyle_Needed + elif quoting_style == "all_valid": + return CQuotingStyle_AllValid + elif quoting_style == "none": + return CQuotingStyle_None + _raise_invalid_function_option(quoting_style, "quoting style") + + +cdef wrap_quoting_style(quoting_style): + if quoting_style == CQuotingStyle_Needed: + return 'needed' + elif quoting_style == CQuotingStyle_AllValid: + return 'all_valid' + elif quoting_style == CQuotingStyle_None: + return 'none' + + +cdef class WriteOptions(_Weakrefable): + """ + Options for writing CSV files. + + Parameters + ---------- + include_header : bool, optional (default True) + Whether to write an initial header line with column names + batch_size : int, optional (default 1024) + How many rows to process together when converting and writing + CSV data + delimiter : 1-character string, optional (default ",") + The character delimiting individual cells in the CSV data. + quoting_style : str, optional (default "needed") + Whether to quote values, and if so, which quoting style to use. + The following values are accepted: + + - "needed" (default): only enclose values in quotes when needed. + - "all_valid": enclose all valid values in quotes; nulls are not quoted. + - "none": do not enclose any values in quotes; values containing + special characters (such as quotes, cell delimiters or line endings) + will raise an error. + """ + + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, *, include_header=None, batch_size=None, + delimiter=None, quoting_style=None): + self.options.reset(new CCSVWriteOptions(CCSVWriteOptions.Defaults())) + if include_header is not None: + self.include_header = include_header + if batch_size is not None: + self.batch_size = batch_size + if delimiter is not None: + self.delimiter = delimiter + if quoting_style is not None: + self.quoting_style = quoting_style + + @property + def include_header(self): + """ + Whether to write an initial header line with column names. 
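# A minimal sketch, assuming pyarrow is installed, of open_csv() and the
# WriteOptions fields documented above (include_header, delimiter, quoting_style);
# write_csv() is the one-shot writer defined further down in this module.
import io
from pyarrow import csv

source = io.BytesIO(b"animals,n_legs\nFlamingo,2\nHorse,4\n")
reader = csv.open_csv(source)   # single-threaded streaming reader
table = reader.read_all()       # drain the stream into a Table

sink = io.BytesIO()
write_options = csv.WriteOptions(include_header=True, delimiter=";",
                                 quoting_style="all_valid")
csv.write_csv(table, sink, write_options=write_options)
print(sink.getvalue().decode())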
+ """ + return deref(self.options).include_header + + @include_header.setter + def include_header(self, value): + deref(self.options).include_header = value + + @property + def batch_size(self): + """ + How many rows to process together when converting and writing + CSV data. + """ + return deref(self.options).batch_size + + @batch_size.setter + def batch_size(self, value): + deref(self.options).batch_size = value + + @property + def delimiter(self): + """ + The character delimiting individual cells in the CSV data. + """ + return chr(deref(self.options).delimiter) + + @delimiter.setter + def delimiter(self, value): + deref(self.options).delimiter = _single_char(value) + + @property + def quoting_style(self): + """ + Whether to quote values, and if so, which quoting style to use. + The following values are accepted: + + - "needed" (default): only enclose values in quotes when needed. + - "all_valid": enclose all valid values in quotes; nulls are not quoted. + - "none": do not enclose any values in quotes; values containing + special characters (such as quotes, cell delimiters or line endings) + will raise an error. + """ + return wrap_quoting_style(deref(self.options).quoting_style) + + @quoting_style.setter + def quoting_style(self, value): + deref(self.options).quoting_style = unwrap_quoting_style(value) + + @staticmethod + cdef WriteOptions wrap(CCSVWriteOptions options): + out = WriteOptions() + out.options.reset(new CCSVWriteOptions(move(options))) + return out + + def validate(self): + check_status(self.options.get().Validate()) + + +cdef _get_write_options(WriteOptions write_options, CCSVWriteOptions* out): + if write_options is None: + out[0] = CCSVWriteOptions.Defaults() + else: + out[0] = deref(write_options.options) + + +def write_csv(data, output_file, write_options=None, + MemoryPool memory_pool=None): + """ + Write record batch or table to a CSV file. + + Parameters + ---------- + data : pyarrow.RecordBatch or pyarrow.Table + The data to write. + output_file : string, path, pyarrow.NativeFile, or file-like object + The location where to write the CSV data. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + + Examples + -------- + + >>> import pyarrow as pa + >>> from pyarrow import csv + + >>> legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> entry_date = pa.array(["01/03/2022", "02/03/2022", + ... "03/03/2022", "04/03/2022"]) + >>> table = pa.table([animals, legs, entry_date], + ... 
names=["animals", "n_legs", "entry"]) + + >>> csv.write_csv(table, "animals.csv") + + >>> write_options = csv.WriteOptions(include_header=False) + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + + >>> write_options = csv.WriteOptions(delimiter=";") + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + """ + cdef: + shared_ptr[COutputStream] stream + CCSVWriteOptions c_write_options + CMemoryPool* c_memory_pool + CRecordBatch* batch + CTable* table + _get_write_options(write_options, &c_write_options) + + get_writer(output_file, &stream) + c_memory_pool = maybe_unbox_memory_pool(memory_pool) + c_write_options.io_context = CIOContext(c_memory_pool) + if isinstance(data, RecordBatch): + batch = pyarrow_unwrap_batch(data).get() + with nogil: + check_status(WriteCSV(deref(batch), c_write_options, stream.get())) + elif isinstance(data, Table): + table = pyarrow_unwrap_table(data).get() + with nogil: + check_status(WriteCSV(deref(table), c_write_options, stream.get())) + else: + raise TypeError(f"Expected Table or RecordBatch, got '{type(data)}'") + + +cdef class CSVWriter(_CRecordBatchWriter): + """ + Writer to create a CSV file. + + Parameters + ---------- + sink : str, path, pyarrow.OutputStream or file-like object + The location where to write the CSV data. + schema : pyarrow.Schema + The schema of the data to be written. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + """ + + def __init__(self, sink, Schema schema, *, + WriteOptions write_options=None, MemoryPool memory_pool=None): + cdef: + shared_ptr[COutputStream] c_stream + shared_ptr[CSchema] c_schema = pyarrow_unwrap_schema(schema) + CCSVWriteOptions c_write_options + CMemoryPool* c_memory_pool = maybe_unbox_memory_pool(memory_pool) + _get_write_options(write_options, &c_write_options) + c_write_options.io_context = CIOContext(c_memory_pool) + get_writer(sink, &c_stream) + with nogil: + self.writer = GetResultValue(MakeCSVWriter( + c_stream, c_schema, c_write_options)) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_cuda.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_cuda.pxd new file mode 100644 index 0000000000000000000000000000000000000000..6acb8826d1789ab2c9e5213f16f2851c9e3dc22b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_cuda.pxd @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: language_level = 3 + +from pyarrow.lib cimport * +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_cuda cimport * + + +cdef class Context(_Weakrefable): + cdef: + shared_ptr[CCudaContext] context + int device_number + + cdef void init(self, const shared_ptr[CCudaContext]& ctx) + + +cdef class IpcMemHandle(_Weakrefable): + cdef: + shared_ptr[CCudaIpcMemHandle] handle + + cdef void init(self, shared_ptr[CCudaIpcMemHandle]& h) + + +cdef class CudaBuffer(Buffer): + cdef: + shared_ptr[CCudaBuffer] cuda_buffer + object base + + cdef void init_cuda(self, + const shared_ptr[CCudaBuffer]& buffer, + object base) + + +cdef class HostBuffer(Buffer): + cdef: + shared_ptr[CCudaHostBuffer] host_buffer + + cdef void init_host(self, const shared_ptr[CCudaHostBuffer]& buffer) + + +cdef class BufferReader(NativeFile): + cdef: + CCudaBufferReader* reader + CudaBuffer buffer + + +cdef class BufferWriter(NativeFile): + cdef: + CCudaBufferWriter* writer + CudaBuffer buffer diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_cuda.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_cuda.pyx new file mode 100644 index 0000000000000000000000000000000000000000..5aed9f8a285188d4f3fa173cffa7d1188bc9006a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_cuda.pyx @@ -0,0 +1,1080 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from pyarrow.lib cimport * +from pyarrow.includes.libarrow_cuda cimport * +from pyarrow.lib import allocate_buffer, as_buffer, ArrowTypeError +from pyarrow.util import get_contiguous_span +cimport cpython as cp + + +cdef class Context(_Weakrefable): + """ + CUDA driver context. + """ + + def __init__(self, *args, **kwargs): + """ + Create a CUDA driver context for a particular device. + + If a CUDA context handle is passed, it is wrapped, otherwise + a default CUDA context for the given device is requested. + + Parameters + ---------- + device_number : int (default 0) + Specify the GPU device for which the CUDA driver context is + requested. + handle : int, optional + Specify CUDA handle for a shared context that has been created + by another library. 
+ """ + # This method exposed because autodoc doesn't pick __cinit__ + + def __cinit__(self, int device_number=0, uintptr_t handle=0): + cdef CCudaDeviceManager* manager + manager = GetResultValue(CCudaDeviceManager.Instance()) + cdef int n = manager.num_devices() + if device_number >= n or device_number < 0: + self.context.reset() + raise ValueError('device_number argument must be ' + 'non-negative less than %s' % (n)) + if handle == 0: + self.context = GetResultValue(manager.GetContext(device_number)) + else: + self.context = GetResultValue(manager.GetSharedContext( + device_number, handle)) + self.device_number = device_number + + @staticmethod + def from_numba(context=None): + """ + Create a Context instance from a Numba CUDA context. + + Parameters + ---------- + context : {numba.cuda.cudadrv.driver.Context, None} + A Numba CUDA context instance. + If None, the current Numba context is used. + + Returns + ------- + shared_context : pyarrow.cuda.Context + Context instance. + """ + if context is None: + import numba.cuda + context = numba.cuda.current_context() + return Context(device_number=context.device.id, + handle=context.handle.value) + + def to_numba(self): + """ + Convert Context to a Numba CUDA context. + + Returns + ------- + context : numba.cuda.cudadrv.driver.Context + Numba CUDA context instance. + """ + import ctypes + import numba.cuda + device = numba.cuda.gpus[self.device_number] + handle = ctypes.c_void_p(self.handle) + context = numba.cuda.cudadrv.driver.Context(device, handle) + + class DummyPendingDeallocs(object): + # Context is managed by pyarrow + def add_item(self, *args, **kwargs): + pass + + context.deallocations = DummyPendingDeallocs() + return context + + @staticmethod + def get_num_devices(): + """ Return the number of GPU devices. + """ + cdef CCudaDeviceManager* manager + manager = GetResultValue(CCudaDeviceManager.Instance()) + return manager.num_devices() + + @property + def device_number(self): + """ Return context device number. + """ + return self.device_number + + @property + def handle(self): + """ Return pointer to context handle. + """ + return self.context.get().handle() + + cdef void init(self, const shared_ptr[CCudaContext]& ctx): + self.context = ctx + + def synchronize(self): + """Blocks until the device has completed all preceding requested + tasks. + """ + check_status(self.context.get().Synchronize()) + + @property + def bytes_allocated(self): + """Return the number of allocated bytes. + """ + return self.context.get().bytes_allocated() + + def get_device_address(self, uintptr_t address): + """Return the device address that is reachable from kernels running in + the context + + Parameters + ---------- + address : int + Specify memory address value + + Returns + ------- + device_address : int + Device address accessible from device context + + Notes + ----- + The device address is defined as a memory address accessible + by device. While it is often a device memory address but it + can be also a host memory address, for instance, when the + memory is allocated as host memory (using cudaMallocHost or + cudaHostAlloc) or as managed memory (using cudaMallocManaged) + or the host memory is page-locked (using cudaHostRegister). + """ + return GetResultValue(self.context.get().GetDeviceAddress(address)) + + def new_buffer(self, int64_t nbytes): + """Return new device buffer. + + Parameters + ---------- + nbytes : int + Specify the number of bytes to be allocated. + + Returns + ------- + buf : CudaBuffer + Allocated buffer. 
+ """ + cdef: + shared_ptr[CCudaBuffer] cudabuf + with nogil: + cudabuf = GetResultValue(self.context.get().Allocate(nbytes)) + return pyarrow_wrap_cudabuffer(cudabuf) + + @property + def memory_manager(self): + """ + The default memory manager tied to this context's device. + + Returns + ------- + MemoryManager + """ + return MemoryManager.wrap(self.context.get().memory_manager()) + + @property + def device(self): + """ + The device instance associated with this context. + + Returns + ------- + Device + """ + return Device.wrap(self.context.get().device()) + + def foreign_buffer(self, address, size, base=None): + """ + Create device buffer from address and size as a view. + + The caller is responsible for allocating and freeing the + memory. When `address==size==0` then a new zero-sized buffer + is returned. + + Parameters + ---------- + address : int + Specify the starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + Specify the size of device buffer in bytes. + base : {None, object} + Specify object that owns the referenced memory. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device reachable memory. + + """ + if not address and size == 0: + return self.new_buffer(0) + cdef: + uintptr_t c_addr = self.get_device_address(address) + int64_t c_size = size + shared_ptr[CCudaBuffer] cudabuf + + cudabuf = GetResultValue(self.context.get().View( + c_addr, c_size)) + return pyarrow_wrap_cudabuffer_base(cudabuf, base) + + def open_ipc_buffer(self, ipc_handle): + """ Open existing CUDA IPC memory handle + + Parameters + ---------- + ipc_handle : IpcMemHandle + Specify opaque pointer to CUipcMemHandle (driver API). + + Returns + ------- + buf : CudaBuffer + referencing device buffer + """ + handle = pyarrow_unwrap_cudaipcmemhandle(ipc_handle) + cdef shared_ptr[CCudaBuffer] cudabuf + with nogil: + cudabuf = GetResultValue( + self.context.get().OpenIpcBuffer(handle.get()[0])) + return pyarrow_wrap_cudabuffer(cudabuf) + + def buffer_from_data(self, object data, int64_t offset=0, int64_t size=-1): + """Create device buffer and initialize with data. + + Parameters + ---------- + data : {CudaBuffer, HostBuffer, Buffer, array-like} + Specify data to be copied to device buffer. + offset : int + Specify the offset of input buffer for device data + buffering. Default: 0. + size : int + Specify the size of device buffer in bytes. Default: all + (starting from input offset) + + Returns + ------- + cbuf : CudaBuffer + Device buffer with copied data. + """ + is_host_data = not pyarrow_is_cudabuffer(data) + buf = as_buffer(data) if is_host_data else data + + bsize = buf.size + if offset < 0 or (bsize and offset >= bsize): + raise ValueError('offset argument is out-of-range') + if size < 0: + size = bsize - offset + elif offset + size > bsize: + raise ValueError( + 'requested larger slice than available in device buffer') + + if offset != 0 or size != bsize: + buf = buf.slice(offset, size) + + result = self.new_buffer(size) + if is_host_data: + result.copy_from_host(buf, position=0, nbytes=size) + else: + result.copy_from_device(buf, position=0, nbytes=size) + return result + + def buffer_from_object(self, obj): + """Create device buffer view of arbitrary object that references + device accessible memory. 
+ + When the object contains a non-contiguous view of device + accessible memory then the returned device buffer will contain + contiguous view of the memory, that is, including the + intermediate data that is otherwise invisible to the input + object. + + Parameters + ---------- + obj : {object, Buffer, HostBuffer, CudaBuffer, ...} + Specify an object that holds (device or host) address that + can be accessed from device. This includes objects with + types defined in pyarrow.cuda as well as arbitrary objects + that implement the CUDA array interface as defined by numba. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device accessible memory. + + """ + if isinstance(obj, HostBuffer): + return self.foreign_buffer(obj.address, obj.size, base=obj) + elif isinstance(obj, Buffer): + return CudaBuffer.from_buffer(obj) + elif isinstance(obj, CudaBuffer): + return obj + elif hasattr(obj, '__cuda_array_interface__'): + desc = obj.__cuda_array_interface__ + addr = desc['data'][0] + if addr is None: + return self.new_buffer(0) + import numpy as np + start, end = get_contiguous_span( + desc['shape'], desc.get('strides'), + np.dtype(desc['typestr']).itemsize) + return self.foreign_buffer(addr + start, end - start, base=obj) + raise ArrowTypeError('cannot create device buffer view from' + ' `%s` object' % (type(obj))) + + +cdef class IpcMemHandle(_Weakrefable): + """A serializable container for a CUDA IPC handle. + """ + cdef void init(self, shared_ptr[CCudaIpcMemHandle]& h): + self.handle = h + + @staticmethod + def from_buffer(Buffer opaque_handle): + """Create IpcMemHandle from opaque buffer (e.g. from another + process) + + Parameters + ---------- + opaque_handle : + a CUipcMemHandle as a const void* + + Returns + ------- + ipc_handle : IpcMemHandle + """ + c_buf = pyarrow_unwrap_buffer(opaque_handle) + cdef: + shared_ptr[CCudaIpcMemHandle] handle + + handle = GetResultValue( + CCudaIpcMemHandle.FromBuffer(c_buf.get().data())) + return pyarrow_wrap_cudaipcmemhandle(handle) + + def serialize(self, pool=None): + """Write IpcMemHandle to a Buffer + + Parameters + ---------- + pool : {MemoryPool, None} + Specify a pool to allocate memory from + + Returns + ------- + buf : Buffer + The serialized buffer. + """ + cdef CMemoryPool* pool_ = maybe_unbox_memory_pool(pool) + cdef shared_ptr[CBuffer] buf + cdef CCudaIpcMemHandle* h = self.handle.get() + with nogil: + buf = GetResultValue(h.Serialize(pool_)) + return pyarrow_wrap_buffer(buf) + + +cdef class CudaBuffer(Buffer): + """An Arrow buffer with data located in a GPU device. + + To create a CudaBuffer instance, use Context.device_buffer(). + + The memory allocated in a CudaBuffer is freed when the buffer object + is deleted. + """ + + def __init__(self): + raise TypeError("Do not call CudaBuffer's constructor directly, use " + "`.device_buffer`" + " method instead.") + + cdef void init_cuda(self, + const shared_ptr[CCudaBuffer]& buffer, + object base): + self.cuda_buffer = buffer + self.init( buffer) + self.base = base + + @staticmethod + def from_buffer(buf): + """ Convert back generic buffer into CudaBuffer + + Parameters + ---------- + buf : Buffer + Specify buffer containing CudaBuffer + + Returns + ------- + dbuf : CudaBuffer + Resulting device buffer. + """ + c_buf = pyarrow_unwrap_buffer(buf) + cuda_buffer = GetResultValue(CCudaBuffer.FromBuffer(c_buf)) + return pyarrow_wrap_cudabuffer(cuda_buffer) + + @staticmethod + def from_numba(mem): + """Create a CudaBuffer view from numba MemoryPointer instance. 
+ + Parameters + ---------- + mem : numba.cuda.cudadrv.driver.MemoryPointer + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of numba MemoryPointer. + """ + ctx = Context.from_numba(mem.context) + if mem.device_pointer.value is None and mem.size==0: + return ctx.new_buffer(0) + return ctx.foreign_buffer(mem.device_pointer.value, mem.size, base=mem) + + def to_numba(self): + """Return numba memory pointer of CudaBuffer instance. + """ + import ctypes + from numba.cuda.cudadrv.driver import MemoryPointer + return MemoryPointer(self.context.to_numba(), + pointer=ctypes.c_void_p(self.address), + size=self.size) + + cdef getitem(self, int64_t i): + return self.copy_to_host(position=i, nbytes=1)[0] + + def copy_to_host(self, int64_t position=0, int64_t nbytes=-1, + Buffer buf=None, + MemoryPool memory_pool=None, c_bool resizable=False): + """Copy memory from GPU device to CPU host + + Caller is responsible for ensuring that all tasks affecting + the memory are finished. Use + + `.context.synchronize()` + + when needed. + + Parameters + ---------- + position : int + Specify the starting position of the source data in GPU + device buffer. Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + the position until host buffer is full). + buf : Buffer + Specify a pre-allocated output buffer in host. Default: None + (allocate new output buffer). + memory_pool : MemoryPool + resizable : bool + Specify extra arguments to allocate_buffer. Used only when + buf is None. + + Returns + ------- + buf : Buffer + Output buffer in host. + + """ + if position < 0 or (self.size and position > self.size) \ + or (self.size == 0 and position != 0): + raise ValueError('position argument is out-of-range') + cdef: + int64_t c_nbytes + if buf is None: + if nbytes < 0: + # copy all starting from position to new host buffer + c_nbytes = self.size - position + else: + if nbytes > self.size - position: + raise ValueError( + 'requested more to copy than available from ' + 'device buffer') + # copy nbytes starting from position to new host buffer + c_nbytes = nbytes + buf = allocate_buffer(c_nbytes, memory_pool=memory_pool, + resizable=resizable) + else: + if nbytes < 0: + # copy all from position until given host buffer is full + c_nbytes = min(self.size - position, buf.size) + else: + if nbytes > buf.size: + raise ValueError( + 'requested copy does not fit into host buffer') + # copy nbytes from position to given host buffer + c_nbytes = nbytes + + cdef: + shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(buf) + int64_t c_position = position + with nogil: + check_status(self.cuda_buffer.get() + .CopyToHost(c_position, c_nbytes, + c_buf.get().mutable_data())) + return buf + + def copy_from_host(self, data, int64_t position=0, int64_t nbytes=-1): + """Copy data from host to device. + + The device buffer must be pre-allocated. + + Parameters + ---------- + data : {Buffer, array-like} + Specify data in host. It can be array-like that is valid + argument to py_buffer + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. 
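+
+ Examples
+ --------
+ A minimal sketch (assumes a CUDA-enabled build and an available
+ device; ``ctx`` and the 8-byte buffer are illustrative only):
+
+ >>> import numpy as np  # doctest: +SKIP
+ >>> from pyarrow import cuda  # doctest: +SKIP
+ >>> ctx = cuda.Context()  # doctest: +SKIP
+ >>> cbuf = ctx.new_buffer(8)  # doctest: +SKIP
+ >>> cbuf.copy_from_host(np.arange(8, dtype=np.uint8))  # doctest: +SKIP
+ 8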
+ """ + if position < 0 or position > self.size: + raise ValueError('position argument is out-of-range') + cdef: + int64_t c_nbytes + buf = as_buffer(data) + + if nbytes < 0: + # copy from host buffer to device buffer starting from + # position until device buffer is full + c_nbytes = min(self.size - position, buf.size) + else: + if nbytes > buf.size: + raise ValueError( + 'requested more to copy than available from host buffer') + if nbytes > self.size - position: + raise ValueError( + 'requested more to copy than available in device buffer') + # copy nbytes from host buffer to device buffer starting + # from position + c_nbytes = nbytes + + cdef: + shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(buf) + int64_t c_position = position + with nogil: + check_status(self.cuda_buffer.get(). + CopyFromHost(c_position, c_buf.get().data(), + c_nbytes)) + return c_nbytes + + def copy_from_device(self, buf, int64_t position=0, int64_t nbytes=-1): + """Copy data from device to device. + + Parameters + ---------- + buf : CudaBuffer + Specify source device buffer. + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + + """ + if position < 0 or position > self.size: + raise ValueError('position argument is out-of-range') + cdef: + int64_t c_nbytes + + if nbytes < 0: + # copy from source device buffer to device buffer starting + # from position until device buffer is full + c_nbytes = min(self.size - position, buf.size) + else: + if nbytes > buf.size: + raise ValueError( + 'requested more to copy than available from device buffer') + if nbytes > self.size - position: + raise ValueError( + 'requested more to copy than available in device buffer') + # copy nbytes from source device buffer to device buffer + # starting from position + c_nbytes = nbytes + + cdef: + shared_ptr[CCudaBuffer] c_buf = pyarrow_unwrap_cudabuffer(buf) + int64_t c_position = position + shared_ptr[CCudaContext] c_src_ctx = pyarrow_unwrap_cudacontext( + buf.context) + void* c_source_data = (c_buf.get().address()) + + if self.context.handle != buf.context.handle: + with nogil: + check_status(self.cuda_buffer.get(). + CopyFromAnotherDevice(c_src_ctx, c_position, + c_source_data, c_nbytes)) + else: + with nogil: + check_status(self.cuda_buffer.get(). + CopyFromDevice(c_position, c_source_data, + c_nbytes)) + return c_nbytes + + def export_for_ipc(self): + """ + Expose this device buffer as IPC memory which can be used in other + processes. + + After calling this function, this device memory will not be + freed when the CudaBuffer is destructed. + + Returns + ------- + ipc_handle : IpcMemHandle + The exported IPC handle + + """ + cdef shared_ptr[CCudaIpcMemHandle] handle + with nogil: + handle = GetResultValue(self.cuda_buffer.get().ExportForIpc()) + return pyarrow_wrap_cudaipcmemhandle(handle) + + @property + def context(self): + """Returns the CUDA driver context of this buffer. + """ + return pyarrow_wrap_cudacontext(self.cuda_buffer.get().context()) + + def slice(self, offset=0, length=None): + """Return slice of device buffer + + Parameters + ---------- + offset : int, default 0 + Specify offset from the start of device buffer to slice + length : int, default None + Specify the length of slice (default is until end of device + buffer starting from offset). 
If the length is larger than + the data available, the returned slice will have a size of + the available data starting from the offset. + + Returns + ------- + sliced : CudaBuffer + Zero-copy slice of device buffer. + + """ + if offset < 0 or (self.size and offset >= self.size): + raise ValueError('offset argument is out-of-range') + cdef int64_t offset_ = offset + cdef int64_t size + if length is None: + size = self.size - offset_ + elif offset + length <= self.size: + size = length + else: + size = self.size - offset + parent = pyarrow_unwrap_cudabuffer(self) + return pyarrow_wrap_cudabuffer(make_shared[CCudaBuffer](parent, + offset_, size)) + + def to_pybytes(self): + """Return device buffer content as Python bytes. + """ + return self.copy_to_host().to_pybytes() + + def __getbuffer__(self, cp.Py_buffer* buffer, int flags): + # Device buffer contains data pointers on the device. Hence, + # cannot support buffer protocol PEP-3118 for CudaBuffer. + raise BufferError('buffer protocol for device buffer not supported') + + +cdef class HostBuffer(Buffer): + """Device-accessible CPU memory created using cudaHostAlloc. + + To create a HostBuffer instance, use + + cuda.new_host_buffer() + """ + + def __init__(self): + raise TypeError("Do not call HostBuffer's constructor directly," + " use `cuda.new_host_buffer` function instead.") + + cdef void init_host(self, const shared_ptr[CCudaHostBuffer]& buffer): + self.host_buffer = buffer + self.init( buffer) + + @property + def size(self): + return self.host_buffer.get().size() + + +cdef class BufferReader(NativeFile): + """File interface for zero-copy read from CUDA buffers. + + Note: Read methods return pointers to device memory. This means + you must be careful using this interface with any Arrow code which + may expect to be able to do anything other than pointer arithmetic + on the returned buffers. + """ + + def __cinit__(self, CudaBuffer obj): + self.buffer = obj + self.reader = new CCudaBufferReader(self.buffer.buffer) + self.set_random_access_file( + shared_ptr[CRandomAccessFile](self.reader)) + self.is_readable = True + + def read_buffer(self, nbytes=None): + """Return a slice view of the underlying device buffer. + + The slice will start at the current reader position and will + have specified size in bytes. + + Parameters + ---------- + nbytes : int, default None + Specify the number of bytes to read. Default: None (read all + remaining bytes). + + Returns + ------- + cbuf : CudaBuffer + New device buffer. + + """ + cdef: + int64_t c_nbytes + shared_ptr[CCudaBuffer] output + + if nbytes is None: + c_nbytes = self.size() - self.tell() + else: + c_nbytes = nbytes + + with nogil: + output = static_pointer_cast[CCudaBuffer, CBuffer]( + GetResultValue(self.reader.Read(c_nbytes))) + + return pyarrow_wrap_cudabuffer(output) + + +cdef class BufferWriter(NativeFile): + """File interface for writing to CUDA buffers. + + By default writes are unbuffered. Use set_buffer_size to enable + buffering. + """ + + def __cinit__(self, CudaBuffer buffer): + self.buffer = buffer + self.writer = new CCudaBufferWriter(self.buffer.cuda_buffer) + self.set_output_stream(shared_ptr[COutputStream](self.writer)) + self.is_writable = True + + def writeat(self, int64_t position, object data): + """Write data to buffer starting from position. + + Parameters + ---------- + position : int + Specify device buffer position where the data will be + written. + data : array-like + Specify data, the data instance must implement buffer + protocol. 
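+
+ Examples
+ --------
+ A minimal sketch (assumes a CUDA-enabled build and an available
+ device; the buffer size and payload are illustrative only):
+
+ >>> from pyarrow import cuda  # doctest: +SKIP
+ >>> ctx = cuda.Context()  # doctest: +SKIP
+ >>> cbuf = ctx.new_buffer(4)  # doctest: +SKIP
+ >>> writer = cuda.BufferWriter(cbuf)  # doctest: +SKIP
+ >>> writer.writeat(0, b"abcd")  # doctest: +SKIP
+ >>> writer.flush()  # doctest: +SKIP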
+ """ + cdef: + Buffer buf = as_buffer(data) + const uint8_t* c_data = buf.buffer.get().data() + int64_t c_size = buf.buffer.get().size() + + with nogil: + check_status(self.writer.WriteAt(position, c_data, c_size)) + + def flush(self): + """ Flush the buffer stream """ + with nogil: + check_status(self.writer.Flush()) + + def seek(self, int64_t position, int whence=0): + # TODO: remove this method after NativeFile.seek supports + # writable files. + cdef int64_t offset + + with nogil: + if whence == 0: + offset = position + elif whence == 1: + offset = GetResultValue(self.writer.Tell()) + offset = offset + position + else: + with gil: + raise ValueError("Invalid value of whence: {0}" + .format(whence)) + check_status(self.writer.Seek(offset)) + return self.tell() + + @property + def buffer_size(self): + """Returns size of host (CPU) buffer, 0 for unbuffered + """ + return self.writer.buffer_size() + + @buffer_size.setter + def buffer_size(self, int64_t buffer_size): + """Set CPU buffer size to limit calls to cudaMemcpy + + Parameters + ---------- + buffer_size : int + Specify the size of CPU buffer to allocate in bytes. + """ + with nogil: + check_status(self.writer.SetBufferSize(buffer_size)) + + @property + def num_bytes_buffered(self): + """Returns number of bytes buffered on host + """ + return self.writer.num_bytes_buffered() + +# Functions + + +def new_host_buffer(const int64_t size, int device=0): + """Return buffer with CUDA-accessible memory on CPU host + + Parameters + ---------- + size : int + Specify the number of bytes to be allocated. + device : int + Specify GPU device number. + + Returns + ------- + dbuf : HostBuffer + Allocated host buffer + """ + cdef shared_ptr[CCudaHostBuffer] buffer + with nogil: + buffer = GetResultValue(AllocateCudaHostBuffer(device, size)) + return pyarrow_wrap_cudahostbuffer(buffer) + + +def serialize_record_batch(object batch, object ctx): + """ Write record batch message to GPU device memory + + Parameters + ---------- + batch : RecordBatch + Record batch to write + ctx : Context + CUDA Context to allocate device memory from + + Returns + ------- + dbuf : CudaBuffer + device buffer which contains the record batch message + """ + cdef shared_ptr[CCudaBuffer] buffer + cdef CRecordBatch* batch_ = pyarrow_unwrap_batch(batch).get() + cdef CCudaContext* ctx_ = pyarrow_unwrap_cudacontext(ctx).get() + with nogil: + buffer = GetResultValue(CudaSerializeRecordBatch(batch_[0], ctx_)) + return pyarrow_wrap_cudabuffer(buffer) + + +def read_message(object source, pool=None): + """ Read Arrow IPC message located on GPU device + + Parameters + ---------- + source : {CudaBuffer, cuda.BufferReader} + Device buffer or reader of device buffer. + pool : MemoryPool (optional) + Pool to allocate CPU memory for the metadata + + Returns + ------- + message : Message + The deserialized message, body still on device + """ + cdef: + Message result = Message.__new__(Message) + cdef CMemoryPool* pool_ = maybe_unbox_memory_pool(pool) + if not isinstance(source, BufferReader): + reader = BufferReader(source) + with nogil: + result.message = move( + GetResultValue(ReadMessage(reader.reader, pool_))) + return result + + +def read_record_batch(object buffer, object schema, *, + DictionaryMemo dictionary_memo=None, pool=None): + """Construct RecordBatch referencing IPC message located on CUDA device. + + While the metadata is copied to host memory for deserialization, + the record batch data remains on the device. 
+ + Parameters + ---------- + buffer : + Device buffer containing the complete IPC message + schema : Schema + The schema for the record batch + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + pool : MemoryPool (optional) + Pool to allocate metadata from + + Returns + ------- + batch : RecordBatch + Reconstructed record batch, with device pointers + + """ + cdef: + shared_ptr[CSchema] schema_ = pyarrow_unwrap_schema(schema) + shared_ptr[CCudaBuffer] buffer_ = pyarrow_unwrap_cudabuffer(buffer) + CDictionaryMemo temp_memo + CDictionaryMemo* arg_dict_memo + CMemoryPool* pool_ = maybe_unbox_memory_pool(pool) + shared_ptr[CRecordBatch] batch + + if dictionary_memo is not None: + arg_dict_memo = dictionary_memo.memo + else: + arg_dict_memo = &temp_memo + + with nogil: + batch = GetResultValue(CudaReadRecordBatch( + schema_, arg_dict_memo, buffer_, pool_)) + return pyarrow_wrap_batch(batch) + + +# Public API + + +cdef public api bint pyarrow_is_buffer(object buffer): + return isinstance(buffer, Buffer) + +# cudabuffer + +cdef public api bint pyarrow_is_cudabuffer(object buffer): + return isinstance(buffer, CudaBuffer) + + +cdef public api object \ + pyarrow_wrap_cudabuffer_base(const shared_ptr[CCudaBuffer]& buf, base): + cdef CudaBuffer result = CudaBuffer.__new__(CudaBuffer) + result.init_cuda(buf, base) + return result + + +cdef public api object \ + pyarrow_wrap_cudabuffer(const shared_ptr[CCudaBuffer]& buf): + cdef CudaBuffer result = CudaBuffer.__new__(CudaBuffer) + result.init_cuda(buf, None) + return result + + +cdef public api shared_ptr[CCudaBuffer] pyarrow_unwrap_cudabuffer(object obj): + if pyarrow_is_cudabuffer(obj): + return (obj).cuda_buffer + raise TypeError('expected CudaBuffer instance, got %s' + % (type(obj).__name__)) + +# cudahostbuffer + +cdef public api bint pyarrow_is_cudahostbuffer(object buffer): + return isinstance(buffer, HostBuffer) + + +cdef public api object \ + pyarrow_wrap_cudahostbuffer(const shared_ptr[CCudaHostBuffer]& buf): + cdef HostBuffer result = HostBuffer.__new__(HostBuffer) + result.init_host(buf) + return result + + +cdef public api shared_ptr[CCudaHostBuffer] \ + pyarrow_unwrap_cudahostbuffer(object obj): + if pyarrow_is_cudahostbuffer(obj): + return (obj).host_buffer + raise TypeError('expected HostBuffer instance, got %s' + % (type(obj).__name__)) + +# cudacontext + +cdef public api bint pyarrow_is_cudacontext(object ctx): + return isinstance(ctx, Context) + + +cdef public api object \ + pyarrow_wrap_cudacontext(const shared_ptr[CCudaContext]& ctx): + cdef Context result = Context.__new__(Context) + result.init(ctx) + return result + + +cdef public api shared_ptr[CCudaContext] \ + pyarrow_unwrap_cudacontext(object obj): + if pyarrow_is_cudacontext(obj): + return (obj).context + raise TypeError('expected Context instance, got %s' + % (type(obj).__name__)) + +# cudaipcmemhandle + +cdef public api bint pyarrow_is_cudaipcmemhandle(object handle): + return isinstance(handle, IpcMemHandle) + + +cdef public api object \ + pyarrow_wrap_cudaipcmemhandle(shared_ptr[CCudaIpcMemHandle]& h): + cdef IpcMemHandle result = IpcMemHandle.__new__(IpcMemHandle) + result.init(h) + return result + + +cdef public api shared_ptr[CCudaIpcMemHandle] \ + pyarrow_unwrap_cudaipcmemhandle(object obj): + if pyarrow_is_cudaipcmemhandle(obj): + return (obj).handle + raise TypeError('expected IpcMemHandle instance, got %s' + % (type(obj).__name__)) diff --git 
a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..67a9a6cd1a06b31a59e31790c5e2d96ab700445e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.cpython-312-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c8355d3504b46bf95f02534ac8c61bec5e6c496086f6d8e985b56f53b057b24 +size 1069560 diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.pxd new file mode 100644 index 0000000000000000000000000000000000000000..220ab6b19affe6b520db3a3501fad2772919f5e4 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.pxd @@ -0,0 +1,183 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +"""Dataset is currently unstable. 
APIs subject to change without notice.""" + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow_dataset cimport * +from pyarrow.lib cimport * +from pyarrow._fs cimport FileSystem, FileInfo + + +cdef CFileSource _make_file_source(object file, FileSystem filesystem=*, object file_size=*) + +cdef class DatasetFactory(_Weakrefable): + + cdef: + SharedPtrNoGIL[CDatasetFactory] wrapped + CDatasetFactory* factory + + cdef init(self, const shared_ptr[CDatasetFactory]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CDatasetFactory]& sp) + + cdef inline shared_ptr[CDatasetFactory] unwrap(self) nogil + + +cdef class Dataset(_Weakrefable): + + cdef: + SharedPtrNoGIL[CDataset] wrapped + CDataset* dataset + public dict _scan_options + + cdef void init(self, const shared_ptr[CDataset]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CDataset]& sp) + + cdef shared_ptr[CDataset] unwrap(self) nogil + + +cdef class Scanner(_Weakrefable): + cdef: + SharedPtrNoGIL[CScanner] wrapped + CScanner* scanner + + cdef void init(self, const shared_ptr[CScanner]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CScanner]& sp) + + cdef shared_ptr[CScanner] unwrap(self) + + @staticmethod + cdef shared_ptr[CScanOptions] _make_scan_options(Dataset dataset, dict py_scanoptions) except * + + +cdef class FragmentScanOptions(_Weakrefable): + + cdef: + shared_ptr[CFragmentScanOptions] wrapped + + cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CFragmentScanOptions]& sp) + + +cdef class FileFormat(_Weakrefable): + + cdef: + shared_ptr[CFileFormat] wrapped + CFileFormat* format + + cdef void init(self, const shared_ptr[CFileFormat]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CFileFormat]& sp) + + cdef inline shared_ptr[CFileFormat] unwrap(self) + + cdef _set_default_fragment_scan_options(self, FragmentScanOptions options) + + # Return a WrittenFile after a file was written. + # May be overridden by subclasses, e.g. to add metadata. 
+ cdef WrittenFile _finish_write(self, path, base_dir, + CFileWriter* file_writer) + + +cdef class FileWriteOptions(_Weakrefable): + + cdef: + shared_ptr[CFileWriteOptions] wrapped + CFileWriteOptions* c_options + + cdef void init(self, const shared_ptr[CFileWriteOptions]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CFileWriteOptions]& sp) + + cdef inline shared_ptr[CFileWriteOptions] unwrap(self) + + +cdef class Fragment(_Weakrefable): + + cdef: + SharedPtrNoGIL[CFragment] wrapped + CFragment* fragment + + cdef void init(self, const shared_ptr[CFragment]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CFragment]& sp) + + cdef inline shared_ptr[CFragment] unwrap(self) + + +cdef class FileFragment(Fragment): + + cdef: + CFileFragment* file_fragment + + cdef void init(self, const shared_ptr[CFragment]& sp) + + +cdef class Partitioning(_Weakrefable): + + cdef: + shared_ptr[CPartitioning] wrapped + CPartitioning* partitioning + + cdef init(self, const shared_ptr[CPartitioning]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CPartitioning]& sp) + + cdef inline shared_ptr[CPartitioning] unwrap(self) + + +cdef class PartitioningFactory(_Weakrefable): + + cdef: + shared_ptr[CPartitioningFactory] wrapped + CPartitioningFactory* factory + object constructor + object options + + cdef init(self, const shared_ptr[CPartitioningFactory]& sp) + + @staticmethod + cdef wrap(const shared_ptr[CPartitioningFactory]& sp, + object constructor, object options) + + cdef inline shared_ptr[CPartitioningFactory] unwrap(self) + + +cdef class WrittenFile(_Weakrefable): + + # The full path to the created file + cdef public str path + # Optional Parquet metadata + # This metadata will have the file path attribute set to the path of + # the written file. + cdef public object metadata + # The size of the file in bytes + cdef public int64_t size diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6b5259f499f0547c09883ed46ff7ae40aa7f0d01 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset.pyx @@ -0,0 +1,4089 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +"""Dataset is currently unstable. 
APIs subject to change without notice.""" + +from cython.operator cimport dereference as deref + +import codecs +import collections +from libcpp cimport bool + +import pyarrow as pa +from pyarrow.lib cimport * +from pyarrow.lib import ArrowTypeError, frombytes, tobytes, _pac +from pyarrow.includes.libarrow_dataset cimport * +from pyarrow._acero cimport ExecNodeOptions +from pyarrow._compute cimport Expression, _bind +from pyarrow._compute import _forbid_instantiation +from pyarrow._fs cimport FileSystem, FileSelector, FileInfo +from pyarrow._csv cimport ( + ConvertOptions, ParseOptions, ReadOptions, WriteOptions) +from pyarrow.util import _is_iterable, _is_path_like, _stringify_path +from pyarrow._json cimport ParseOptions as JsonParseOptions +from pyarrow._json cimport ReadOptions as JsonReadOptions + + +_DEFAULT_BATCH_SIZE = 2**17 +_DEFAULT_BATCH_READAHEAD = 16 +_DEFAULT_FRAGMENT_READAHEAD = 4 + + +# Initialise support for Datasets in ExecPlan +Initialize() + + +_orc_fileformat = None +_orc_imported = False + + +def _get_orc_fileformat(): + """ + Import OrcFileFormat on first usage (to avoid circular import issue + when `pyarrow._dataset_orc` would be imported first) + """ + global _orc_fileformat + global _orc_imported + if not _orc_imported: + try: + from pyarrow._dataset_orc import OrcFileFormat + _orc_fileformat = OrcFileFormat + except ImportError as e: + _orc_fileformat = None + finally: + _orc_imported = True + return _orc_fileformat + + +_dataset_pq = False + + +def _get_parquet_classes(): + """ + Import Parquet class files on first usage (to avoid circular import issue + when `pyarrow._dataset_parquet` would be imported first) + """ + global _dataset_pq + if _dataset_pq is False: + try: + import pyarrow._dataset_parquet as _dataset_pq + except ImportError: + _dataset_pq = None + + +def _get_parquet_symbol(name): + """ + Get a symbol from pyarrow.parquet if the latter is importable, otherwise + return None. 
+ """ + _get_parquet_classes() + return _dataset_pq and getattr(_dataset_pq, name) + + +cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, object file_size=None): + + cdef: + CFileSource c_source + shared_ptr[CFileSystem] c_filesystem + CFileInfo c_info + c_string c_path + shared_ptr[CRandomAccessFile] c_file + shared_ptr[CBuffer] c_buffer + int64_t c_size + + if isinstance(file, Buffer): + c_buffer = pyarrow_unwrap_buffer(file) + c_source = CFileSource(move(c_buffer)) + elif _is_path_like(file): + if filesystem is None: + raise ValueError("cannot construct a FileSource from " + "a path without a FileSystem") + c_filesystem = filesystem.unwrap() + c_path = tobytes(_stringify_path(file)) + + if file_size is not None: + c_size = file_size + c_info = FileInfo(c_path, size=c_size).unwrap() + c_source = CFileSource(move(c_info), move(c_filesystem)) + else: + c_source = CFileSource(move(c_path), move(c_filesystem)) + elif hasattr(file, 'read'): + # Optimistically hope this is file-like + c_file = get_native_file(file, False).get_random_access_file() + c_source = CFileSource(move(c_file)) + + else: + raise TypeError("cannot construct a FileSource " + "from " + str(file)) + + return c_source + + +cdef CSegmentEncoding _get_segment_encoding(str segment_encoding): + if segment_encoding == "none": + return CSegmentEncoding_None + elif segment_encoding == "uri": + return CSegmentEncoding_Uri + raise ValueError(f"Unknown segment encoding: {segment_encoding}") + + +cdef str _wrap_segment_encoding(CSegmentEncoding segment_encoding): + if segment_encoding == CSegmentEncoding_None: + return "none" + elif segment_encoding == CSegmentEncoding_Uri: + return "uri" + raise ValueError("Unknown segment encoding") + + +cdef Expression _true = Expression._scalar(True) + + +cdef class Dataset(_Weakrefable): + """ + Collection of data fragments and potentially child datasets. + + Arrow Datasets allow you to query against data that has been split across + multiple files. This sharding of data may indicate partitioning, which + can accelerate queries that only touch some partitions (files). + """ + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef void init(self, const shared_ptr[CDataset]& sp): + self.wrapped = sp + self.dataset = sp.get() + self._scan_options = dict() + + @staticmethod + cdef wrap(const shared_ptr[CDataset]& sp): + type_name = frombytes(sp.get().type_name()) + + classes = { + 'union': UnionDataset, + 'filesystem': FileSystemDataset, + 'in-memory': InMemoryDataset, + } + + class_ = classes.get(type_name, None) + if class_ is None: + raise TypeError(type_name) + + cdef Dataset self = class_.__new__(class_) + self.init(sp) + return self + + cdef shared_ptr[CDataset] unwrap(self) nogil: + return self.wrapped + + @property + def partition_expression(self): + """ + An Expression which evaluates to true for all data viewed by this + Dataset. + """ + return Expression.wrap(self.dataset.partition_expression()) + + def replace_schema(self, Schema schema not None): + """ + Return a copy of this Dataset with a different schema. + + The copy will view the same Fragments. If the new schema is not + compatible with the original dataset's schema then an error will + be raised. + + Parameters + ---------- + schema : Schema + The new dataset schema. + """ + cdef shared_ptr[CDataset] copy = GetResultValue( + self.dataset.ReplaceSchema(pyarrow_unwrap_schema(schema)) + ) + + d = Dataset.wrap(move(copy)) + if self._scan_options: + # Preserve scan options if set. 
+ d._scan_options = self._scan_options.copy() + return d + + def get_fragments(self, Expression filter=None): + """Returns an iterator over the fragments in this dataset. + + Parameters + ---------- + filter : Expression, default None + Return fragments matching the optional filter, either using the + partition_expression or internal information like Parquet's + statistics. + + Returns + ------- + fragments : iterator of Fragment + """ + if self._scan_options.get("filter") is not None: + # Accessing fragments of a filtered dataset is not supported. + # It would be unclear if you wanted to filter the fragments + # or the rows in those fragments. + raise ValueError( + "Retrieving fragments of a filtered or projected " + "dataset is not allowed. Remove the filtering." + ) + + return self._get_fragments(filter) + + def _get_fragments(self, Expression filter): + cdef: + CExpression c_filter + + if filter is None: + c_fragments = move(GetResultValue(self.dataset.GetFragments())) + else: + c_filter = _bind(filter, self.schema) + c_fragments = move(GetResultValue( + self.dataset.GetFragments(c_filter))) + + for maybe_fragment in c_fragments: + yield Fragment.wrap(GetResultValue(move(maybe_fragment))) + + def _scanner_options(self, options): + """Returns the default options to create a new Scanner. + + This is automatically invoked by :meth:`Dataset.scanner` + and there is no need to use it. + """ + new_options = options.copy() + + # at the moment only support filter + requested_filter = options.get("filter") + current_filter = self._scan_options.get("filter") + if requested_filter is not None and current_filter is not None: + new_options["filter"] = current_filter & requested_filter + elif current_filter is not None: + new_options["filter"] = current_filter + + return new_options + + def scanner(self, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Build a scan operation against the dataset. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + See the :meth:`Scanner.from_dataset` method for further information. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. 
+ If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], + ... 'n_legs': [2, 2, 4, 4, 5, 100], + ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", + ... "Brittle stars", "Centipede"]}) + >>> + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "dataset_scanner.parquet") + + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset("dataset_scanner.parquet") + + Selecting a subset of the columns: + + >>> dataset.scanner(columns=["year", "n_legs"]).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2021,2022,2019,2021]] + n_legs: [[2,2,4,4,5,100]] + + Projecting selected columns using an expression: + + >>> dataset.scanner(columns={ + ... "n_legs_uint": ds.field("n_legs").cast("uint8"), + ... }).to_table() + pyarrow.Table + n_legs_uint: uint8 + ---- + n_legs_uint: [[2,2,4,4,5,100]] + + Filtering rows while scanning: + + >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2022,2021,2022,2021]] + n_legs: [[2,4,4,100]] + animal: [["Parrot","Dog","Horse","Centipede"]] + """ + return Scanner.from_dataset( + self, + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ) + + def to_batches(self, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Read the dataset as materialized record batches. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + return self.scanner( + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).to_batches() + + def to_table(self, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Read the dataset to an Arrow table. + + Note that this method reads all the selected data from the dataset + into memory. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). 
+ + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + return self.scanner( + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).to_table() + + def take(self, + object indices, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. 
+ If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + return self.scanner( + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).take(indices) + + def head(self, + int num_rows, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. 
+ batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + return self.scanner( + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).head(num_rows) + + def count_rows(self, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ + return self.scanner( + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).count_rows() + + @property + def schema(self): + """The common schema of the full Dataset""" + return pyarrow_wrap_schema(self.dataset.schema()) + + def filter(self, expression not None): + """ + Apply a row filter to the dataset. + + Parameters + ---------- + expression : Expression + The filter that should be applied to the dataset. 
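+
+ For example (a minimal sketch, assuming ``dataset`` is an already
+ opened Dataset with an integer ``year`` column):
+
+ >>> import pyarrow.dataset as ds  # doctest: +SKIP
+ >>> filtered = dataset.filter(ds.field("year") > 2020)  # doctest: +SKIP
+ >>> filtered.count_rows() <= dataset.count_rows()  # doctest: +SKIP
+ True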
+ + Returns + ------- + Dataset + """ + cdef: + Dataset filtered_dataset + + new_filter = expression + current_filter = self._scan_options.get("filter") + if current_filter is not None and new_filter is not None: + new_filter = current_filter & new_filter + + filtered_dataset = self.__class__.__new__(self.__class__) + filtered_dataset.init(self.wrapped) + filtered_dataset._scan_options = dict(filter=new_filter) + return filtered_dataset + + def sort_by(self, sorting, **kwargs): + """ + Sort the Dataset by one or multiple columns. + + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + InMemoryDataset + A new dataset sorted according to the sort keys. + """ + if isinstance(sorting, str): + sorting = [(sorting, "ascending")] + + res = _pac()._sort_source( + self, output_type=InMemoryDataset, sort_keys=sorting, **kwargs + ) + return res + + def join(self, right_dataset, keys, right_keys=None, join_type="left outer", + left_suffix=None, right_suffix=None, coalesce_keys=True, + use_threads=True): + """ + Perform a join between this dataset and another one. + + Result of the join will be a new dataset, where further + operations can be applied. + + Parameters + ---------- + right_dataset : dataset + The dataset to join to the current one, acting as the right dataset + in the join operation. + keys : str or list[str] + The columns from current dataset that should be used as keys + of the join operation left side. + right_keys : str or list[str], default None + The columns from the right_dataset that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left dataset. + join_type : str, default "left outer" + The kind of join that should be performed, one of + ("left semi", "right semi", "left anti", "right anti", + "inner", "left outer", "right outer", "full outer") + left_suffix : str, default None + Which suffix to add to right column names. This prevents confusion + when the columns in left and right datasets have colliding names. + right_suffix : str, default None + Which suffix to add to the left column names. This prevents confusion + when the columns in left and right datasets have colliding names. + coalesce_keys : bool, default True + If the duplicated keys should be omitted from one of the sides + in the join result. + use_threads : bool, default True + Whenever to use multithreading or not. + + Returns + ------- + InMemoryDataset + """ + if right_keys is None: + right_keys = keys + return _pac()._perform_join( + join_type, self, keys, right_dataset, right_keys, + left_suffix=left_suffix, right_suffix=right_suffix, + use_threads=use_threads, coalesce_keys=coalesce_keys, + output_type=InMemoryDataset + ) + + def join_asof(self, right_dataset, on, by, tolerance, right_on=None, right_by=None): + """ + Perform an asof join between this dataset and another one. + + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both datasets must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". 
+ + Result of the join will be a new Dataset, where further + operations can be applied. + + Parameters + ---------- + right_dataset : dataset + The dataset to join to the current one, acting as the right dataset + in the join operation. + on : str + The column from current dataset that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input table must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current dataset that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row `right.on - left.on <= tolerance`. The + `tolerance` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. + + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_dataset that should be used as the on key + on the join operation right side. + When ``None`` use the same key name as the left dataset. + right_by : str or list[str], default None + The columns from the right_dataset that should be used as by keys + on the join operation right side. + When ``None`` use the same key names as the left dataset. + + Returns + ------- + InMemoryDataset + """ + if right_on is None: + right_on = on + if right_by is None: + right_by = by + return _pac()._perform_join_asof(self, on, by, + right_dataset, right_on, right_by, + tolerance, output_type=InMemoryDataset) + + +cdef class InMemoryDataset(Dataset): + """ + A Dataset wrapping in-memory data. + + Parameters + ---------- + source : RecordBatch, Table, list, tuple + The data for this dataset. Can be a RecordBatch, Table, list of + RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader + If an iterable is provided, the schema must also be provided. + schema : Schema, optional + Only required if passing an iterable as the source + """ + + cdef: + CInMemoryDataset* in_memory_dataset + + def __init__(self, source, Schema schema=None): + cdef: + shared_ptr[CInMemoryDataset] in_memory_dataset + + if isinstance(source, (pa.RecordBatch, pa.Table)): + source = [source] + + if isinstance(source, (list, tuple)): + batches = [] + for item in source: + if isinstance(item, pa.RecordBatch): + batches.append(item) + elif isinstance(item, pa.Table): + batches.extend(item.to_batches()) + else: + raise TypeError( + 'Expected a list of tables or batches. 
The given list '
+                        'contains a ' + type(item).__name__)
+                if schema is None:
+                    schema = item.schema
+                elif not schema.equals(item.schema):
+                    raise ArrowTypeError(
+                        f'Item has schema\n{item.schema}\nwhich does not '
+                        f'match expected schema\n{schema}')
+            if not batches and schema is None:
+                raise ValueError('Must provide schema to construct in-memory '
+                                 'dataset from an empty list')
+            table = pa.Table.from_batches(batches, schema=schema)
+            in_memory_dataset = make_shared[CInMemoryDataset](
+                pyarrow_unwrap_table(table))
+        else:
+            raise TypeError(
+                'Expected a table, batch, or list of tables/batches '
+                'instead of the given type: ' +
+                type(source).__name__
+            )
+
+        self.init(<shared_ptr[CDataset]> in_memory_dataset)
+
+    cdef void init(self, const shared_ptr[CDataset]& sp):
+        Dataset.init(self, sp)
+        self.in_memory_dataset = <CInMemoryDataset*> sp.get()
+
+
+cdef class UnionDataset(Dataset):
+    """
+    A Dataset wrapping child datasets.
+
+    Children's schemas must agree with the provided schema.
+
+    Parameters
+    ----------
+    schema : Schema
+        A known schema to conform to.
+    children : list of Dataset
+        One or more input children
+    """
+
+    cdef:
+        CUnionDataset* union_dataset
+
+    def __init__(self, Schema schema not None, children):
+        cdef:
+            Dataset child
+            CDatasetVector c_children
+            shared_ptr[CUnionDataset] union_dataset
+
+        for child in children:
+            c_children.push_back(child.wrapped)
+
+        union_dataset = GetResultValue(CUnionDataset.Make(
+            pyarrow_unwrap_schema(schema), move(c_children)))
+        self.init(<shared_ptr[CDataset]> union_dataset)
+
+    cdef void init(self, const shared_ptr[CDataset]& sp):
+        Dataset.init(self, sp)
+        self.union_dataset = <CUnionDataset*> sp.get()
+
+    def __reduce__(self):
+        return UnionDataset, (self.schema, self.children)
+
+    @property
+    def children(self):
+        cdef CDatasetVector children = self.union_dataset.children()
+        return [Dataset.wrap(children[i]) for i in range(children.size())]
+
+
+cdef class FileSystemDataset(Dataset):
+    """
+    A Dataset of file fragments.
+
+    A FileSystemDataset is composed of one or more FileFragment.
+
+    Parameters
+    ----------
+    fragments : list[Fragment]
+        List of fragments to consume.
+    schema : Schema
+        The top-level schema of the Dataset.
+    format : FileFormat
+        File format of the fragments, currently only ParquetFileFormat,
+        IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported.
+    filesystem : FileSystem
+        FileSystem of the fragments.
+    root_partition : Expression, optional
+        The top-level partition of the Dataset.
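+
+    As an illustrative sketch (the path below is hypothetical), a
+    FileSystemDataset is usually obtained from the :func:`dataset` factory
+    function rather than constructed directly:
+
+    >>> import pyarrow.dataset as ds
+    >>> dataset = ds.dataset("/path/to/files", format="parquet")  # doctest: +SKIP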
+ """ + + cdef: + CFileSystemDataset* filesystem_dataset + + def __init__(self, fragments, Schema schema, FileFormat format, + FileSystem filesystem=None, root_partition=None): + cdef: + FileFragment fragment=None + vector[shared_ptr[CFileFragment]] c_fragments + CResult[shared_ptr[CDataset]] result + shared_ptr[CFileSystem] c_filesystem + + if root_partition is None: + root_partition = _true + elif not isinstance(root_partition, Expression): + raise TypeError( + "Argument 'root_partition' has incorrect type (expected " + "Expression, got {0})".format(type(root_partition)) + ) + + for fragment in fragments: + c_fragments.push_back( + static_pointer_cast[CFileFragment, CFragment]( + fragment.unwrap())) + + if filesystem is None: + filesystem = fragment.filesystem + + if filesystem is not None: + c_filesystem = filesystem.unwrap() + + result = CFileSystemDataset.Make( + pyarrow_unwrap_schema(schema), + ( root_partition).unwrap(), + format.unwrap(), + c_filesystem, + c_fragments + ) + self.init(GetResultValue(result)) + + @property + def filesystem(self): + return FileSystem.wrap(self.filesystem_dataset.filesystem()) + + @property + def partitioning(self): + """ + The partitioning of the Dataset source, if discovered. + + If the FileSystemDataset is created using the ``dataset()`` factory + function with a partitioning specified, this will return the + finalized Partitioning object from the dataset discovery. In all + other cases, this returns None. + """ + c_partitioning = self.filesystem_dataset.partitioning() + if c_partitioning.get() == nullptr: + return None + try: + return Partitioning.wrap(c_partitioning) + except TypeError: + # e.g. type_name "default" + return None + + cdef void init(self, const shared_ptr[CDataset]& sp): + Dataset.init(self, sp) + self.filesystem_dataset = sp.get() + + def __reduce__(self): + return FileSystemDataset, ( + list(self.get_fragments()), + self.schema, + self.format, + self.filesystem, + self.partition_expression + ) + + @classmethod + def from_paths(cls, paths, schema=None, format=None, + filesystem=None, partitions=None, root_partition=None): + """ + A Dataset created from a list of paths on a particular filesystem. + + Parameters + ---------- + paths : list of str + List of file paths to create the fragments from. + schema : Schema + The top-level schema of the DataDataset. + format : FileFormat + File format to create fragments from, currently only + ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. + filesystem : FileSystem + The filesystem which files are from. + partitions : list[Expression], optional + Attach additional partition information for the file paths. + root_partition : Expression, optional + The top-level partition of the DataDataset. + """ + if root_partition is None: + root_partition = _true + + for arg, class_, name in [ + (schema, Schema, 'schema'), + (format, FileFormat, 'format'), + (filesystem, FileSystem, 'filesystem'), + (root_partition, Expression, 'root_partition') + ]: + if not isinstance(arg, class_): + raise TypeError( + "Argument '{0}' has incorrect type (expected {1}, " + "got {2})".format(name, class_.__name__, type(arg)) + ) + + partitions = partitions or [_true] * len(paths) + + if len(paths) != len(partitions): + raise ValueError( + 'The number of files resulting from paths_or_selector ' + 'must be equal to the number of partitions.' 
+ ) + + fragments = [ + format.make_fragment(path, filesystem, partitions[i]) + for i, path in enumerate(paths) + ] + return FileSystemDataset(fragments, schema, format, + filesystem, root_partition) + + @property + def files(self): + """List of the files""" + cdef vector[c_string] files = self.filesystem_dataset.files() + return [frombytes(f) for f in files] + + @property + def format(self): + """The FileFormat of this source.""" + return FileFormat.wrap(self.filesystem_dataset.format()) + + +cdef class FileWriteOptions(_Weakrefable): + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef void init(self, const shared_ptr[CFileWriteOptions]& sp): + self.wrapped = sp + self.c_options = sp.get() + + @staticmethod + cdef wrap(const shared_ptr[CFileWriteOptions]& sp): + type_name = frombytes(sp.get().type_name()) + + classes = { + 'csv': CsvFileWriteOptions, + 'ipc': IpcFileWriteOptions, + 'parquet': _get_parquet_symbol('ParquetFileWriteOptions'), + } + + class_ = classes.get(type_name, None) + if class_ is None: + raise TypeError(type_name) + + cdef FileWriteOptions self = class_.__new__(class_) + self.init(sp) + return self + + @property + def format(self): + return FileFormat.wrap(self.c_options.format()) + + cdef inline shared_ptr[CFileWriteOptions] unwrap(self): + return self.wrapped + + +cdef class FileFormat(_Weakrefable): + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef void init(self, const shared_ptr[CFileFormat]& sp): + self.wrapped = sp + self.format = sp.get() + + @staticmethod + cdef wrap(const shared_ptr[CFileFormat]& sp): + type_name = frombytes(sp.get().type_name()) + + classes = { + 'ipc': IpcFileFormat, + 'csv': CsvFileFormat, + 'json': JsonFileFormat, + 'parquet': _get_parquet_symbol('ParquetFileFormat'), + 'orc': _get_orc_fileformat(), + } + + class_ = classes.get(type_name, None) + if class_ is None: + raise TypeError(type_name) + + cdef FileFormat self = class_.__new__(class_) + self.init(sp) + return self + + cdef WrittenFile _finish_write(self, path, base_dir, + CFileWriter* file_writer): + parquet_metadata = None + size = GetResultValue(file_writer.GetBytesWritten()) + return WrittenFile(path, parquet_metadata, size) + + cdef inline shared_ptr[CFileFormat] unwrap(self): + return self.wrapped + + def inspect(self, file, filesystem=None): + """ + Infer the schema of a file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to infer a schema from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + + Returns + ------- + schema : Schema + The schema inferred from the file + """ + cdef: + CFileSource c_source = _make_file_source(file, filesystem, file_size=None) + CResult[shared_ptr[CSchema]] c_result + with nogil: + c_result = self.format.Inspect(c_source) + c_schema = GetResultValue(c_result) + return pyarrow_wrap_schema(move(c_schema)) + + def make_fragment(self, file, filesystem=None, + Expression partition_expression=None, + *, file_size=None): + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. 
Allows + fragment to be potentially skipped while scanning with a filter. + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + if partition_expression is None: + partition_expression = _true + c_source = _make_file_source(file, filesystem, file_size) + c_fragment = GetResultValue( + self.format.MakeFragment(move(c_source), + partition_expression.unwrap(), + nullptr)) + return Fragment.wrap(move(c_fragment)) + + def make_write_options(self): + sp_write_options = self.format.DefaultWriteOptions() + if sp_write_options.get() == nullptr: + # DefaultWriteOptions() may return `nullptr` which means that + # the format does not yet support writing datasets. + raise NotImplementedError( + "Writing datasets not yet implemented for this file format." + ) + return FileWriteOptions.wrap(sp_write_options) + + @property + def default_extname(self): + return frombytes(self.format.type_name()) + + @property + def default_fragment_scan_options(self): + dfso = FragmentScanOptions.wrap( + self.wrapped.get().default_fragment_scan_options) + # CsvFileFormat stores a Python-specific encoding field that needs + # to be restored because it does not exist in the C++ struct + if isinstance(self, CsvFileFormat): + if self._read_options_py is not None: + dfso.read_options = self._read_options_py + return dfso + + @default_fragment_scan_options.setter + def default_fragment_scan_options(self, FragmentScanOptions options): + if options is None: + self.wrapped.get().default_fragment_scan_options =\ + nullptr + else: + self._set_default_fragment_scan_options(options) + + cdef _set_default_fragment_scan_options(self, FragmentScanOptions options): + raise ValueError(f"Cannot set fragment scan options for " + f"'{options.type_name}' on {self.__class__.__name__}") + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False + + +cdef class Fragment(_Weakrefable): + """Fragment of data from a Dataset.""" + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef void init(self, const shared_ptr[CFragment]& sp): + self.wrapped = sp + self.fragment = sp.get() + + @staticmethod + cdef wrap(const shared_ptr[CFragment]& sp): + type_name = frombytes(sp.get().type_name()) + + classes = { + # IpcFileFormat, CsvFileFormat, JsonFileFormat and OrcFileFormat do not have + # corresponding subclasses of FileFragment + 'ipc': FileFragment, + 'csv': FileFragment, + 'json': FileFragment, + 'orc': FileFragment, + 'parquet': _get_parquet_symbol('ParquetFileFragment'), + } + + class_ = classes.get(type_name, None) + if class_ is None: + class_ = Fragment + + cdef Fragment self = class_.__new__(class_) + self.init(sp) + return self + + cdef inline shared_ptr[CFragment] unwrap(self): + return self.wrapped + + @property + def physical_schema(self): + """Return the physical schema of this Fragment. This schema can be + different from the dataset read schema.""" + cdef: + CResult[shared_ptr[CSchema]] maybe_schema + with nogil: + maybe_schema = self.fragment.ReadPhysicalSchema() + return pyarrow_wrap_schema(GetResultValue(maybe_schema)) + + @property + def partition_expression(self): + """An Expression which evaluates to true for all data viewed by this + Fragment. 
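+
+        As an illustrative sketch (the path below is hypothetical), a
+        fragment of a hive-partitioned dataset carries its directory keys
+        in this expression:
+
+        >>> import pyarrow.dataset as ds
+        >>> dataset = ds.dataset("/data", partitioning="hive")  # doctest: +SKIP
+        >>> frag = next(iter(dataset.get_fragments()))          # doctest: +SKIP
+        >>> frag.partition_expression  # e.g. (year == 2009)    # doctest: +SKIP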
+ """ + return Expression.wrap(self.fragment.partition_expression()) + + def scanner(self, + Schema schema=None, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Build a scan operation against the fragment. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + Parameters + ---------- + schema : Schema + Schema to use for scanning. This is used to unify a Fragment to + its Dataset's schema. If not specified this will use the + Fragment's physical schema which might differ for each Fragment. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + scanner : Scanner + """ + return Scanner.from_fragment( + self, + schema=schema, + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ) + + def to_batches(self, + Schema schema=None, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Read the fragment as materialized record batches. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + record_batches : iterator of RecordBatch + """ + return Scanner.from_fragment( + self, + schema=schema, + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).to_batches() + + def to_table(self, + Schema schema=None, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Convert this Fragment into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + table : Table + """ + return self.scanner( + schema=schema, + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).to_table() + + def take(self, + object indices, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + The indices of row to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + Table + """ + return self.scanner( + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).take(indices) + + def head(self, + int num_rows, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Load the first N rows of the fragment. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + Table + """ + return self.scanner( + columns=columns, + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).head(num_rows) + + def count_rows(self, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, + MemoryPool memory_pool=None): + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ + return self.scanner( + filter=filter, + batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + fragment_scan_options=fragment_scan_options, + use_threads=use_threads, + memory_pool=memory_pool + ).count_rows() + + +cdef class FileFragment(Fragment): + """A Fragment representing a data file.""" + + cdef void init(self, const shared_ptr[CFragment]& sp): + Fragment.init(self, sp) + self.file_fragment = sp.get() + + def __repr__(self): + type_name = frombytes(self.fragment.type_name()) + if type_name != "parquet": + typ = f" type={type_name}" + else: + # parquet has a subclass -> type embedded in class name + typ = "" + partition_dict = get_partition_keys(self.partition_expression) + partition = ", ".join( + [f"{key}={val}" for key, val in partition_dict.items()] + ) + if partition: + partition = f" partition=[{partition}]" + return "".format( + self.__class__.__name__, typ, self.path, partition + ) + + def __reduce__(self): + buffer = self.buffer + return self.format.make_fragment, ( + self.path if buffer is None else buffer, + self.filesystem, + self.partition_expression + ) + + def open(self): + """ + Open a NativeFile of the buffer or file viewed by this fragment. 
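+
+        As an illustrative sketch (``frag`` is a hypothetical FileFragment),
+        the returned object can be read like any other NativeFile:
+
+        >>> f = frag.open()      # doctest: +SKIP
+        >>> header = f.read(4)   # doctest: +SKIP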
+ """ + cdef: + shared_ptr[CFileSystem] c_filesystem + shared_ptr[CRandomAccessFile] opened + c_string c_path + NativeFile out = NativeFile() + + if self.buffer is not None: + return pa.BufferReader(self.buffer) + + c_path = tobytes(self.file_fragment.source().path()) + with nogil: + c_filesystem = self.file_fragment.source().filesystem() + opened = GetResultValue(c_filesystem.get().OpenInputFile(c_path)) + + out.set_random_access_file(opened) + out.is_readable = True + return out + + @property + def path(self): + """ + The path of the data file viewed by this fragment, if it views a + file. If instead it views a buffer, this will be "". + """ + return frombytes(self.file_fragment.source().path()) + + @property + def filesystem(self): + """ + The FileSystem containing the data file viewed by this fragment, if + it views a file. If instead it views a buffer, this will be None. + """ + cdef: + shared_ptr[CFileSystem] c_fs + c_fs = self.file_fragment.source().filesystem() + + if c_fs.get() == nullptr: + return None + + return FileSystem.wrap(c_fs) + + @property + def buffer(self): + """ + The buffer viewed by this fragment, if it views a buffer. If + instead it views a file, this will be None. + """ + cdef: + shared_ptr[CBuffer] c_buffer + c_buffer = self.file_fragment.source().buffer() + + if c_buffer.get() == nullptr: + return None + + return pyarrow_wrap_buffer(c_buffer) + + @property + def format(self): + """ + The format of the data file viewed by this fragment. + """ + return FileFormat.wrap(self.file_fragment.format()) + + +cdef class FragmentScanOptions(_Weakrefable): + """Scan options specific to a particular fragment and scan operation.""" + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): + self.wrapped = sp + + @staticmethod + cdef wrap(const shared_ptr[CFragmentScanOptions]& sp): + if not sp: + return None + + type_name = frombytes(sp.get().type_name()) + + classes = { + 'csv': CsvFragmentScanOptions, + 'json': JsonFragmentScanOptions, + 'parquet': _get_parquet_symbol('ParquetFragmentScanOptions'), + } + + class_ = classes.get(type_name, None) + if class_ is None: + raise TypeError(type_name) + + cdef FragmentScanOptions self = class_.__new__(class_) + self.init(sp) + return self + + @property + def type_name(self): + return frombytes(self.wrapped.get().type_name()) + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False + + +cdef class IpcFileWriteOptions(FileWriteOptions): + cdef: + CIpcFileWriteOptions* ipc_options + + def __init__(self): + _forbid_instantiation(self.__class__) + + @property + def write_options(self): + out = IpcWriteOptions() + out.c_options = CIpcWriteOptions(deref(self.ipc_options.options)) + return out + + @write_options.setter + def write_options(self, IpcWriteOptions write_options not None): + self.ipc_options.options.reset( + new CIpcWriteOptions(write_options.c_options)) + + cdef void init(self, const shared_ptr[CFileWriteOptions]& sp): + FileWriteOptions.init(self, sp) + self.ipc_options = sp.get() + + +cdef class IpcFileFormat(FileFormat): + + def __init__(self): + self.init(shared_ptr[CFileFormat](new CIpcFileFormat())) + + def equals(self, IpcFileFormat other): + """ + Parameters + ---------- + other : pyarrow.dataset.IpcFileFormat + + Returns + ------- + True + """ + return True + + def make_write_options(self, **kwargs): + """ + Parameters + ---------- + **kwargs : dict + + Returns + ------- + pyarrow.ipc.IpcWriteOptions + 
""" + cdef IpcFileWriteOptions opts = \ + FileFormat.make_write_options(self) + opts.write_options = IpcWriteOptions(**kwargs) + return opts + + @property + def default_extname(self): + return "arrow" + + def __reduce__(self): + return IpcFileFormat, tuple() + + +cdef class FeatherFileFormat(IpcFileFormat): + + @property + def default_extname(self): + return "feather" + + +cdef class CsvFileFormat(FileFormat): + """ + FileFormat for CSV files. + + Parameters + ---------- + parse_options : pyarrow.csv.ParseOptions + Options regarding CSV parsing. + default_fragment_scan_options : CsvFragmentScanOptions + Default options for fragments scan. + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ + cdef: + CCsvFileFormat* csv_format + # The encoding field in ReadOptions does not exist in the C++ struct. + # We need to store it here and override it when reading + # default_fragment_scan_options.read_options + public ReadOptions _read_options_py + + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, ParseOptions parse_options=None, + default_fragment_scan_options=None, + ConvertOptions convert_options=None, + ReadOptions read_options=None): + self.init(shared_ptr[CFileFormat](new CCsvFileFormat())) + if parse_options is not None: + self.parse_options = parse_options + if convert_options is not None or read_options is not None: + if default_fragment_scan_options: + raise ValueError('If `default_fragment_scan_options` is ' + 'given, cannot specify convert_options ' + 'or read_options') + self.default_fragment_scan_options = CsvFragmentScanOptions( + convert_options=convert_options, read_options=read_options) + elif isinstance(default_fragment_scan_options, dict): + self.default_fragment_scan_options = CsvFragmentScanOptions( + **default_fragment_scan_options) + elif isinstance(default_fragment_scan_options, CsvFragmentScanOptions): + self.default_fragment_scan_options = default_fragment_scan_options + elif default_fragment_scan_options is not None: + raise TypeError('`default_fragment_scan_options` must be either ' + 'a dictionary or an instance of ' + 'CsvFragmentScanOptions') + if read_options is not None: + self._read_options_py = read_options + + cdef void init(self, const shared_ptr[CFileFormat]& sp): + FileFormat.init(self, sp) + self.csv_format = sp.get() + + def make_write_options(self, **kwargs): + """ + Parameters + ---------- + **kwargs : dict + + Returns + ------- + pyarrow.csv.WriteOptions + """ + cdef CsvFileWriteOptions opts = \ + FileFormat.make_write_options(self) + opts.write_options = WriteOptions(**kwargs) + return opts + + @property + def parse_options(self): + return ParseOptions.wrap(self.csv_format.parse_options) + + @parse_options.setter + def parse_options(self, ParseOptions parse_options not None): + self.csv_format.parse_options = deref(parse_options.options) + + cdef _set_default_fragment_scan_options(self, FragmentScanOptions options): + if options.type_name == 'csv': + self.csv_format.default_fragment_scan_options = options.wrapped + self.default_fragment_scan_options.read_options = options.read_options + self._read_options_py = options.read_options + else: + super()._set_default_fragment_scan_options(options) + + def equals(self, CsvFileFormat other): + """ + Parameters + ---------- + other : pyarrow.dataset.CsvFileFormat + + Returns + ------- + bool + """ + return ( + self.parse_options.equals(other.parse_options) and + 
self.default_fragment_scan_options == + other.default_fragment_scan_options) + + def __reduce__(self): + return CsvFileFormat, (self.parse_options, + self.default_fragment_scan_options) + + def __repr__(self): + return f"" + + +cdef class CsvFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for CSV fragments. + + Parameters + ---------- + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ + + cdef: + CCsvFragmentScanOptions* csv_options + # The encoding field in ReadOptions does not exist in the C++ struct. + # We need to store it here and override it when reading read_options + ReadOptions _read_options_py + + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, ConvertOptions convert_options=None, + ReadOptions read_options=None): + self.init(shared_ptr[CFragmentScanOptions]( + new CCsvFragmentScanOptions())) + if convert_options is not None: + self.convert_options = convert_options + if read_options is not None: + self.read_options = read_options + self._read_options_py = read_options + + cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): + FragmentScanOptions.init(self, sp) + self.csv_options = sp.get() + + @property + def convert_options(self): + return ConvertOptions.wrap(self.csv_options.convert_options) + + @convert_options.setter + def convert_options(self, ConvertOptions convert_options not None): + self.csv_options.convert_options = deref(convert_options.options) + + @property + def read_options(self): + read_options = ReadOptions.wrap(self.csv_options.read_options) + if self._read_options_py is not None: + read_options.encoding = self._read_options_py.encoding + return read_options + + @read_options.setter + def read_options(self, ReadOptions read_options not None): + self.csv_options.read_options = deref(read_options.options) + self._read_options_py = read_options + if codecs.lookup(read_options.encoding).name != 'utf-8': + self.csv_options.stream_transform_func = deref( + make_streamwrap_func(read_options.encoding, 'utf-8')) + + def equals(self, CsvFragmentScanOptions other): + """ + Parameters + ---------- + other : pyarrow.dataset.CsvFragmentScanOptions + + Returns + ------- + bool + """ + return ( + other and + self.convert_options.equals(other.convert_options) and + self.read_options.equals(other.read_options)) + + def __reduce__(self): + return CsvFragmentScanOptions, (self.convert_options, + self.read_options) + + +cdef class CsvFileWriteOptions(FileWriteOptions): + cdef: + CCsvFileWriteOptions* csv_options + object _properties + + def __init__(self): + _forbid_instantiation(self.__class__) + + @property + def write_options(self): + return WriteOptions.wrap(deref(self.csv_options.write_options)) + + @write_options.setter + def write_options(self, WriteOptions write_options not None): + self.csv_options.write_options.reset( + new CCSVWriteOptions(deref(write_options.options))) + + cdef void init(self, const shared_ptr[CFileWriteOptions]& sp): + FileWriteOptions.init(self, sp) + self.csv_options = sp.get() + + +cdef class JsonFileFormat(FileFormat): + """ + FileFormat for JSON files. + + Parameters + ---------- + default_fragment_scan_options : JsonFragmentScanOptions + Default options for fragments scan. + parse_options : pyarrow.json.ParseOptions + Options regarding json parsing. + read_options : pyarrow.json.ReadOptions + General read options. 
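+
+    As an illustrative sketch (the directory below is hypothetical), the
+    format object is typically handed to the :func:`dataset` factory:
+
+    >>> import pyarrow.dataset as ds
+    >>> dataset = ds.dataset("/data/logs", format=ds.JsonFileFormat())  # doctest: +SKIP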
+ """ + cdef: + CJsonFileFormat* json_format + + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, default_fragment_scan_options=None, + JsonParseOptions parse_options=None, + JsonReadOptions read_options=None): + self.init(shared_ptr[CFileFormat](new CJsonFileFormat())) + if parse_options is not None or read_options is not None: + if default_fragment_scan_options is not None: + raise ValueError('If `default_fragment_scan_options` is ' + 'given, cannot specify read_options') + self.default_fragment_scan_options = JsonFragmentScanOptions( + parse_options=parse_options, + read_options=read_options) + elif isinstance(default_fragment_scan_options, dict): + self.default_fragment_scan_options = JsonFragmentScanOptions( + **default_fragment_scan_options) + elif isinstance(default_fragment_scan_options, JsonFragmentScanOptions): + self.default_fragment_scan_options = default_fragment_scan_options + elif default_fragment_scan_options is not None: + raise TypeError('`default_fragment_scan_options` must be either ' + 'a dictionary or an instance of ' + 'JsonFragmentScanOptions') + + cdef void init(self, const shared_ptr[CFileFormat]& sp): + FileFormat.init(self, sp) + self.json_format = sp.get() + + cdef _set_default_fragment_scan_options(self, FragmentScanOptions options): + if options.type_name == 'json': + self.json_format.default_fragment_scan_options = options.wrapped + self.default_fragment_scan_options.read_options = options.read_options + self.default_fragment_scan_options.parse_options = options.parse_options + else: + super()._set_default_fragment_scan_options(options) + + def equals(self, JsonFileFormat other): + """ + Parameters + ---------- + other : pyarrow.dataset.JsonFileFormat + + Returns + ------- + bool + """ + return (other and + self.default_fragment_scan_options == + other.default_fragment_scan_options) + + def __reduce__(self): + return JsonFileFormat, (self.default_fragment_scan_options,) + + def __repr__(self): + return "" + + +cdef class JsonFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for JSON fragments. + + Parameters + ---------- + parse_options : pyarrow.json.ParseOptions + Options regarding JSON parsing. + read_options : pyarrow.json.ReadOptions + General read options. 
+ """ + cdef: + CJsonFragmentScanOptions* json_options + + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, JsonParseOptions parse_options=None, + JsonReadOptions read_options=None): + self.init(shared_ptr[CFragmentScanOptions]( + new CJsonFragmentScanOptions())) + if parse_options is not None: + self.parse_options = parse_options + if read_options is not None: + self.read_options = read_options + + cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): + FragmentScanOptions.init(self, sp) + self.json_options = sp.get() + + @property + def parse_options(self): + return JsonParseOptions.wrap(self.json_options.parse_options) + + @parse_options.setter + def parse_options(self, JsonParseOptions parse_options not None): + self.json_options.parse_options = parse_options.options + + @property + def read_options(self): + return JsonReadOptions.wrap(self.json_options.read_options) + + @read_options.setter + def read_options(self, JsonReadOptions read_options not None): + self.json_options.read_options = read_options.options + + def equals(self, JsonFragmentScanOptions other): + """ + Parameters + ---------- + other : pyarrow.dataset.JsonFragmentScanOptions + + Returns + ------- + bool + """ + return ( + other and + self.read_options.equals(other.read_options) and + self.parse_options.equals(other.parse_options)) + + def __reduce__(self): + return JsonFragmentScanOptions, (self.parse_options, self.read_options) + + +cdef class Partitioning(_Weakrefable): + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef init(self, const shared_ptr[CPartitioning]& sp): + self.wrapped = sp + self.partitioning = sp.get() + + @staticmethod + cdef wrap(const shared_ptr[CPartitioning]& sp): + type_name = frombytes(sp.get().type_name()) + + classes = { + 'directory': DirectoryPartitioning, + 'hive': HivePartitioning, + 'filename': FilenamePartitioning, + } + + class_ = classes.get(type_name, None) + if class_ is None: + raise TypeError(type_name) + + cdef Partitioning self = class_.__new__(class_) + self.init(sp) + return self + + cdef inline shared_ptr[CPartitioning] unwrap(self): + return self.wrapped + + def __eq__(self, other): + if isinstance(other, Partitioning): + return self.partitioning.Equals(deref((other).unwrap())) + return False + + def parse(self, path): + """ + Parse a path into a partition expression. + + Parameters + ---------- + path : str + + Returns + ------- + pyarrow.dataset.Expression + """ + cdef CResult[CExpression] result + result = self.partitioning.Parse(tobytes(path)) + return Expression.wrap(GetResultValue(result)) + + def format(self, expr): + """ + Convert a filter expression into a tuple of (directory, filename) using + the current partitioning scheme + + Parameters + ---------- + expr : pyarrow.dataset.Expression + + Returns + ------- + tuple[str, str] + + Examples + -------- + + Specify the Schema for paths like "/2009/June": + + >>> import pyarrow as pa + >>> import pyarrow.dataset as ds + >>> import pyarrow.compute as pc + >>> part = ds.partitioning(pa.schema([("year", pa.int16()), + ... ("month", pa.string())])) + >>> part.format( + ... (pc.field("year") == 1862) & (pc.field("month") == "Jan") + ... 
) + ('1862/Jan', '') + """ + cdef: + CPartitionPathFormat result + + result = GetResultValue(self.partitioning.Format( + Expression.unwrap(expr) + )) + + return frombytes(result.directory), frombytes(result.filename) + + @property + def schema(self): + """The arrow Schema attached to the partitioning.""" + return pyarrow_wrap_schema(self.partitioning.schema()) + + +cdef class PartitioningFactory(_Weakrefable): + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef init(self, const shared_ptr[CPartitioningFactory]& sp): + self.wrapped = sp + self.factory = sp.get() + + @staticmethod + cdef wrap(const shared_ptr[CPartitioningFactory]& sp, + object constructor, object options): + cdef PartitioningFactory self = PartitioningFactory.__new__( + PartitioningFactory + ) + self.init(sp) + self.constructor = constructor + self.options = options + return self + + cdef inline shared_ptr[CPartitioningFactory] unwrap(self): + return self.wrapped + + def __reduce__(self): + return self.constructor, self.options + + @property + def type_name(self): + return frombytes(self.factory.type_name()) + + +cdef vector[shared_ptr[CArray]] _partitioning_dictionaries( + Schema schema, dictionaries) except *: + cdef: + vector[shared_ptr[CArray]] c_dictionaries + + dictionaries = dictionaries or {} + + for field in schema: + dictionary = dictionaries.get(field.name) + + if (isinstance(field.type, pa.DictionaryType) and + dictionary is not None): + c_dictionaries.push_back(pyarrow_unwrap_array(dictionary)) + else: + c_dictionaries.push_back( nullptr) + + return c_dictionaries + + +cdef class KeyValuePartitioning(Partitioning): + + cdef: + CKeyValuePartitioning* keyvalue_partitioning + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef init(self, const shared_ptr[CPartitioning]& sp): + Partitioning.init(self, sp) + self.keyvalue_partitioning = sp.get() + self.wrapped = sp + self.partitioning = sp.get() + + def __reduce__(self): + dictionaries = self.dictionaries + if dictionaries: + dictionaries = dict(zip(self.schema.names, dictionaries)) + segment_encoding = _wrap_segment_encoding( + deref(self.keyvalue_partitioning).segment_encoding() + ) + return self.__class__, (self.schema, dictionaries, segment_encoding) + + @property + def dictionaries(self): + """ + The unique values for each partition field, if available. + + Those values are only available if the Partitioning object was + created through dataset discovery from a PartitioningFactory, or + if the dictionaries were manually specified in the constructor. + If no dictionary field is available, this returns an empty list. + """ + cdef vector[shared_ptr[CArray]] c_arrays + c_arrays = self.keyvalue_partitioning.dictionaries() + res = [] + for arr in c_arrays: + if arr.get() == nullptr: + # Partitioning object has not been created through + # inspected Factory + res.append(None) + else: + res.append(pyarrow_wrap_array(arr)) + return res + + +def _constructor_directory_partitioning_factory(*args): + return DirectoryPartitioning.discover(*args) + + +cdef class DirectoryPartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The DirectoryPartitioning expects one segment in the file path for each + field in the schema (all fields are required to be present). + For example given schema the path "/2009/11" would + be parsed to ("year"_ == 2009 and "month"_ == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. 
+ dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + DirectoryPartitioning + + Examples + -------- + >>> from pyarrow.dataset import DirectoryPartitioning + >>> partitioning = DirectoryPartitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())])) + >>> print(partitioning.parse("/2009/11/")) + ((year == 2009) and (month == 11)) + """ + + cdef: + CDirectoryPartitioning* directory_partitioning + + def __init__(self, Schema schema not None, dictionaries=None, + segment_encoding="uri"): + cdef: + shared_ptr[CDirectoryPartitioning] c_partitioning + CKeyValuePartitioningOptions c_options + + c_options.segment_encoding = _get_segment_encoding(segment_encoding) + c_partitioning = make_shared[CDirectoryPartitioning]( + pyarrow_unwrap_schema(schema), + _partitioning_dictionaries(schema, dictionaries), + c_options, + ) + self.init( c_partitioning) + + cdef init(self, const shared_ptr[CPartitioning]& sp): + KeyValuePartitioning.init(self, sp) + self.directory_partitioning = sp.get() + + @staticmethod + def discover(field_names=None, infer_dictionary=False, + max_partition_dictionary_size=0, + schema=None, segment_encoding="uri"): + """ + Discover a DirectoryPartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. 
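+
+        As an illustrative sketch (the path below is hypothetical), the
+        returned factory is typically passed to the :func:`dataset`
+        function:
+
+        >>> import pyarrow.dataset as ds
+        >>> part = ds.DirectoryPartitioning.discover(["year", "month"])
+        >>> dataset = ds.dataset("/data", partitioning=part)  # doctest: +SKIP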
+ """ + cdef: + CPartitioningFactoryOptions c_options + vector[c_string] c_field_names + + if max_partition_dictionary_size in {-1, None}: + infer_dictionary = True + elif max_partition_dictionary_size != 0: + raise NotImplementedError("max_partition_dictionary_size must be " + "0, -1, or None") + + if infer_dictionary: + c_options.infer_dictionary = True + + if schema: + c_options.schema = pyarrow_unwrap_schema(schema) + c_field_names = [tobytes(f.name) for f in schema] + elif not field_names: + raise ValueError( + "Neither field_names nor schema was passed; " + "cannot infer field_names") + else: + c_field_names = [tobytes(s) for s in field_names] + + c_options.segment_encoding = _get_segment_encoding(segment_encoding) + + return PartitioningFactory.wrap( + CDirectoryPartitioning.MakeFactory(c_field_names, c_options), + _constructor_directory_partitioning_factory, + (field_names, infer_dictionary, max_partition_dictionary_size, + schema, segment_encoding) + ) + + +def _constructor_hive_partitioning_factory(*args): + return HivePartitioning.discover(*args) + + +cdef class HivePartitioning(KeyValuePartitioning): + """ + A Partitioning for "/$key=$value/" nested directories as found in + Apache Hive. + + Multi-level, directory based partitioning scheme originating from + Apache Hive with all data files stored in the leaf directories. Data is + partitioned by static values of a particular column in the schema. + Partition keys are represented in the form $key=$value in directory names. + Field order is ignored, as are missing or unrecognized field names. + + For example, given schema, a possible + path would be "/year=2009/month=11/day=15". + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + If any field is None then this fallback will be used as a label + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + HivePartitioning + + Examples + -------- + >>> from pyarrow.dataset import HivePartitioning + >>> partitioning = HivePartitioning( + ... 
pa.schema([("year", pa.int16()), ("month", pa.int8())])) + >>> print(partitioning.parse("/year=2009/month=11/")) + ((year == 2009) and (month == 11)) + + """ + + cdef: + CHivePartitioning* hive_partitioning + + def __init__(self, + Schema schema not None, + dictionaries=None, + null_fallback="__HIVE_DEFAULT_PARTITION__", + segment_encoding="uri"): + + cdef: + shared_ptr[CHivePartitioning] c_partitioning + CHivePartitioningOptions c_options + + c_options.null_fallback = tobytes(null_fallback) + c_options.segment_encoding = _get_segment_encoding(segment_encoding) + + c_partitioning = make_shared[CHivePartitioning]( + pyarrow_unwrap_schema(schema), + _partitioning_dictionaries(schema, dictionaries), + c_options, + ) + self.init( c_partitioning) + + cdef init(self, const shared_ptr[CPartitioning]& sp): + KeyValuePartitioning.init(self, sp) + self.hive_partitioning = sp.get() + + def __reduce__(self): + dictionaries = self.dictionaries + if dictionaries: + dictionaries = dict(zip(self.schema.names, dictionaries)) + segment_encoding = _wrap_segment_encoding( + deref(self.keyvalue_partitioning).segment_encoding() + ) + null_fallback = frombytes(deref(self.hive_partitioning).null_fallback()) + return HivePartitioning, ( + self.schema, dictionaries, null_fallback, segment_encoding + ) + + @staticmethod + def discover(infer_dictionary=False, + max_partition_dictionary_size=0, + null_fallback="__HIVE_DEFAULT_PARTITION__", + schema=None, + segment_encoding="uri"): + """ + Discover a HivePartitioning. + + Parameters + ---------- + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain. This can be more efficient when + materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + When inferring a schema for partition fields this value will be + replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ + for compatibility with Spark + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. 
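+
+        As an illustrative sketch (the path below is hypothetical), the
+        returned factory can be passed to the :func:`dataset` function;
+        with default options this is equivalent to ``partitioning="hive"``:
+
+        >>> import pyarrow.dataset as ds
+        >>> factory = ds.HivePartitioning.discover(infer_dictionary=True)
+        >>> dataset = ds.dataset("/data", partitioning=factory)  # doctest: +SKIP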
+ """ + cdef: + CHivePartitioningFactoryOptions c_options + + if max_partition_dictionary_size in {-1, None}: + infer_dictionary = True + elif max_partition_dictionary_size != 0: + raise NotImplementedError("max_partition_dictionary_size must be " + "0, -1, or None") + + if infer_dictionary: + c_options.infer_dictionary = True + + c_options.null_fallback = tobytes(null_fallback) + + if schema: + c_options.schema = pyarrow_unwrap_schema(schema) + + c_options.segment_encoding = _get_segment_encoding(segment_encoding) + + return PartitioningFactory.wrap( + CHivePartitioning.MakeFactory(c_options), + _constructor_hive_partitioning_factory, + (infer_dictionary, max_partition_dictionary_size, null_fallback, + schema, segment_encoding), + ) + + +def _constructor_filename_partitioning_factory(*args): + return FilenamePartitioning.discover(*args) + + +cdef class FilenamePartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The FilenamePartitioning expects one segment in the file name for each + field in the schema (all fields are required to be present) separated + by '_'. For example given schema the name + ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + FilenamePartitioning + + Examples + -------- + >>> from pyarrow.dataset import FilenamePartitioning + >>> partitioning = FilenamePartitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())])) + >>> print(partitioning.parse("2009_11_data.parquet")) + ((year == 2009) and (month == 11)) + """ + + cdef: + CFilenamePartitioning* filename_partitioning + + def __init__(self, Schema schema not None, dictionaries=None, + segment_encoding="uri"): + cdef: + shared_ptr[CFilenamePartitioning] c_partitioning + CKeyValuePartitioningOptions c_options + + c_options.segment_encoding = _get_segment_encoding(segment_encoding) + c_partitioning = make_shared[CFilenamePartitioning]( + pyarrow_unwrap_schema(schema), + _partitioning_dictionaries(schema, dictionaries), + c_options, + ) + self.init( c_partitioning) + + cdef init(self, const shared_ptr[CPartitioning]& sp): + KeyValuePartitioning.init(self, sp) + self.filename_partitioning = sp.get() + + @staticmethod + def discover(field_names=None, infer_dictionary=False, + schema=None, segment_encoding="uri"): + """ + Discover a FilenamePartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. 
Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + cdef: + CPartitioningFactoryOptions c_options + vector[c_string] c_field_names + + if infer_dictionary: + c_options.infer_dictionary = True + + if schema: + c_options.schema = pyarrow_unwrap_schema(schema) + c_field_names = [tobytes(f.name) for f in schema] + elif not field_names: + raise TypeError( + "Neither field_names nor schema was passed; " + "cannot infer field_names") + else: + c_field_names = [tobytes(s) for s in field_names] + + c_options.segment_encoding = _get_segment_encoding(segment_encoding) + + return PartitioningFactory.wrap( + CFilenamePartitioning.MakeFactory(c_field_names, c_options), + _constructor_filename_partitioning_factory, + (field_names, infer_dictionary, schema, segment_encoding) + ) + + +cdef class DatasetFactory(_Weakrefable): + """ + DatasetFactory is used to create a Dataset, inspect the Schema + of the fragments contained in it, and declare a partitioning. + """ + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef init(self, const shared_ptr[CDatasetFactory]& sp): + self.wrapped = sp + self.factory = sp.get() + + @staticmethod + cdef wrap(const shared_ptr[CDatasetFactory]& sp): + cdef DatasetFactory self = \ + DatasetFactory.__new__(DatasetFactory) + self.init(sp) + return self + + cdef inline shared_ptr[CDatasetFactory] unwrap(self) nogil: + return self.wrapped + + @property + def root_partition(self): + return Expression.wrap(self.factory.root_partition()) + + @root_partition.setter + def root_partition(self, Expression expr): + check_status(self.factory.SetRootPartition(expr.unwrap())) + + def inspect_schemas(self): + cdef CResult[vector[shared_ptr[CSchema]]] result + cdef CInspectOptions options + with nogil: + result = self.factory.InspectSchemas(options) + + schemas = [] + for s in GetResultValue(result): + schemas.append(pyarrow_wrap_schema(s)) + return schemas + + def inspect(self): + """ + Inspect all data fragments and return a common Schema. + + Returns + ------- + Schema + """ + cdef: + CInspectOptions options + CResult[shared_ptr[CSchema]] result + with nogil: + result = self.factory.Inspect(options) + return pyarrow_wrap_schema(GetResultValue(result)) + + def finish(self, Schema schema=None): + """ + Create a Dataset using the inspected schema or an explicit schema + (if given). + + Parameters + ---------- + schema : Schema, default None + The schema to conform the source to. If None, the inspected + schema is used. + + Returns + ------- + Dataset + """ + cdef: + shared_ptr[CSchema] sp_schema + CResult[shared_ptr[CDataset]] result + + if schema is not None: + sp_schema = pyarrow_unwrap_schema(schema) + with nogil: + result = self.factory.FinishWithSchema(sp_schema) + else: + with nogil: + result = self.factory.Finish() + + return Dataset.wrap(GetResultValue(result)) + + +cdef class FileSystemFactoryOptions(_Weakrefable): + """ + Influences the discovery of filesystem paths. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. 
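Editor's note: a short sketch of the DatasetFactory round trip defined above; `factory` here stands for any concrete factory (for instance the FileSystemDatasetFactory defined later in this file) and is an assumption of the example.

    # Inspect the per-fragment schemas, unify them, then materialize the Dataset.
    schemas = factory.inspect_schemas()   # one Schema per inspected fragment
    unified = factory.inspect()           # common Schema across all fragments
    dataset = factory.finish(unified)     # or factory.finish() to reuse the inspected schema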
+ The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning/PartitioningFactory, optional + Apply the Partitioning to every discovered Fragment. See Partitioning or + PartitioningFactory documentation. + exclude_invalid_files : bool, optional (default True) + If True, invalid files will be excluded (file format specific check). + This will incur IO for each files in a serial and single threaded + fashion. Disabling this feature will skip the IO, but unsupported + files may be present in the Dataset (resulting in an error at scan + time). + selector_ignore_prefixes : list, optional + When discovering from a Selector (and not from an explicit file list), + ignore files and directories matching any of these prefixes. + By default this is ['.', '_']. + """ + + cdef: + CFileSystemFactoryOptions options + + __slots__ = () # avoid mistakingly creating attributes + + def __init__(self, partition_base_dir=None, partitioning=None, + exclude_invalid_files=None, + list selector_ignore_prefixes=None): + if isinstance(partitioning, PartitioningFactory): + self.partitioning_factory = partitioning + elif isinstance(partitioning, Partitioning): + self.partitioning = partitioning + + if partition_base_dir is not None: + self.partition_base_dir = partition_base_dir + if exclude_invalid_files is not None: + self.exclude_invalid_files = exclude_invalid_files + if selector_ignore_prefixes is not None: + self.selector_ignore_prefixes = selector_ignore_prefixes + + cdef inline CFileSystemFactoryOptions unwrap(self): + return self.options + + @property + def partitioning(self): + """Partitioning to apply to discovered files. + + NOTE: setting this property will overwrite partitioning_factory. + """ + c_partitioning = self.options.partitioning.partitioning() + if c_partitioning.get() == nullptr: + return None + return Partitioning.wrap(c_partitioning) + + @partitioning.setter + def partitioning(self, Partitioning value): + self.options.partitioning = ( value).unwrap() + + @property + def partitioning_factory(self): + """PartitioningFactory to apply to discovered files and + discover a Partitioning. + + NOTE: setting this property will overwrite partitioning. + """ + c_factory = self.options.partitioning.factory() + if c_factory.get() == nullptr: + return None + return PartitioningFactory.wrap(c_factory, None, None) + + @partitioning_factory.setter + def partitioning_factory(self, PartitioningFactory value): + self.options.partitioning = ( value).unwrap() + + @property + def partition_base_dir(self): + """ + Base directory to strip paths before applying the partitioning. + """ + return frombytes(self.options.partition_base_dir) + + @partition_base_dir.setter + def partition_base_dir(self, value): + self.options.partition_base_dir = tobytes(value) + + @property + def exclude_invalid_files(self): + """Whether to exclude invalid files.""" + return self.options.exclude_invalid_files + + @exclude_invalid_files.setter + def exclude_invalid_files(self, bint value): + self.options.exclude_invalid_files = value + + @property + def selector_ignore_prefixes(self): + """ + List of prefixes. Files matching one of those prefixes will be + ignored by the discovery process. 
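Editor's note: a hedged sketch of how these options are typically assembled; the paths and prefixes are hypothetical.

    import pyarrow.dataset as ds

    opts = ds.FileSystemFactoryOptions(
        partition_base_dir="data/",                 # strip this prefix before partitioning
        exclude_invalid_files=False,                # skip the per-file validity check
        selector_ignore_prefixes=[".", "_", "tmp"], # ignore hidden/temporary entries
    )
    # Note: partitioning and partitioning_factory overwrite each other.
    opts.partitioning_factory = ds.HivePartitioning.discover()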
+ """ + return [frombytes(p) for p in self.options.selector_ignore_prefixes] + + @selector_ignore_prefixes.setter + def selector_ignore_prefixes(self, values): + self.options.selector_ignore_prefixes = [tobytes(v) for v in values] + + +cdef vector[CFileInfo] unwrap_finfos(finfos): + cdef vector[CFileInfo] o_vect + for fi in finfos: + o_vect.push_back(( fi).unwrap()) + return o_vect + + +cdef class FileSystemDatasetFactory(DatasetFactory): + """ + Create a DatasetFactory from a list of paths with schema inspection. + + Parameters + ---------- + filesystem : pyarrow.fs.FileSystem + Filesystem to discover. + paths_or_selector : pyarrow.fs.FileSelector or list of path-likes + Either a Selector object or a list of path-like objects. + format : FileFormat + Currently only ParquetFileFormat and IpcFileFormat are supported. + options : FileSystemFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. + """ + + cdef: + CFileSystemDatasetFactory* filesystem_factory + + def __init__(self, FileSystem filesystem not None, paths_or_selector, + FileFormat format not None, + FileSystemFactoryOptions options=None): + cdef: + vector[c_string] paths + vector[CFileInfo] finfos + CFileSelector c_selector + CResult[shared_ptr[CDatasetFactory]] result + shared_ptr[CFileSystem] c_filesystem + shared_ptr[CFileFormat] c_format + CFileSystemFactoryOptions c_options + + options = options or FileSystemFactoryOptions() + c_options = options.unwrap() + c_filesystem = filesystem.unwrap() + c_format = format.unwrap() + + if isinstance(paths_or_selector, FileSelector): + with nogil: + c_selector = ( paths_or_selector).selector + result = CFileSystemDatasetFactory.MakeFromSelector( + c_filesystem, + c_selector, + c_format, + c_options + ) + elif isinstance(paths_or_selector, (list, tuple)): + if len(paths_or_selector) > 0 and isinstance(paths_or_selector[0], FileInfo): + finfos = unwrap_finfos(paths_or_selector) + with nogil: + result = CFileSystemDatasetFactory.MakeFromFileInfos( + c_filesystem, + finfos, + c_format, + c_options + ) + else: + paths = [tobytes(s) for s in paths_or_selector] + with nogil: + result = CFileSystemDatasetFactory.MakeFromPaths( + c_filesystem, + paths, + c_format, + c_options + ) + else: + raise TypeError('Must pass either paths or a FileSelector, but ' + 'passed {}'.format(type(paths_or_selector))) + + self.init(GetResultValue(result)) + + cdef init(self, shared_ptr[CDatasetFactory]& sp): + DatasetFactory.init(self, sp) + self.filesystem_factory = sp.get() + + +cdef class UnionDatasetFactory(DatasetFactory): + """ + Provides a way to inspect/discover a Dataset's expected schema before + materialization. + + Parameters + ---------- + factories : list of DatasetFactory + """ + + cdef: + CUnionDatasetFactory* union_factory + + def __init__(self, list factories): + cdef: + DatasetFactory factory + vector[shared_ptr[CDatasetFactory]] c_factories + for factory in factories: + c_factories.push_back(factory.unwrap()) + self.init(GetResultValue(CUnionDatasetFactory.Make(c_factories))) + + cdef init(self, const shared_ptr[CDatasetFactory]& sp): + DatasetFactory.init(self, sp) + self.union_factory = sp.get() + + +cdef class RecordBatchIterator(_Weakrefable): + """An iterator over a sequence of record batches.""" + cdef: + # An object that must be kept alive with the iterator. 
+ object iterator_owner + # Iterator is a non-POD type and Cython uses offsetof, leading + # to a compiler warning unless wrapped like so + SharedPtrNoGIL[CRecordBatchIterator] iterator + + def __init__(self): + _forbid_instantiation(self.__class__, subclasses_instead=False) + + @staticmethod + cdef wrap(object owner, CRecordBatchIterator iterator): + cdef RecordBatchIterator self = \ + RecordBatchIterator.__new__(RecordBatchIterator) + self.iterator_owner = owner + self.iterator = make_shared[CRecordBatchIterator](move(iterator)) + return self + + cdef inline shared_ptr[CRecordBatchIterator] unwrap(self) nogil: + return self.iterator + + def __iter__(self): + return self + + def __next__(self): + cdef shared_ptr[CRecordBatch] record_batch + with nogil: + record_batch = GetResultValue(move(self.iterator.get().Next())) + if record_batch == NULL: + raise StopIteration + return pyarrow_wrap_batch(record_batch) + + +class TaggedRecordBatch(collections.namedtuple( + "TaggedRecordBatch", ["record_batch", "fragment"])): + """ + A combination of a record batch and the fragment it came from. + + Parameters + ---------- + record_batch : RecordBatch + The record batch. + fragment : Fragment + Fragment of the record batch. + """ + + +cdef class TaggedRecordBatchIterator(_Weakrefable): + """An iterator over a sequence of record batches with fragments.""" + cdef: + object iterator_owner + SharedPtrNoGIL[CTaggedRecordBatchIterator] iterator + + def __init__(self): + _forbid_instantiation(self.__class__, subclasses_instead=False) + + @staticmethod + cdef wrap(object owner, CTaggedRecordBatchIterator iterator): + cdef TaggedRecordBatchIterator self = \ + TaggedRecordBatchIterator.__new__(TaggedRecordBatchIterator) + self.iterator_owner = owner + self.iterator = make_shared[CTaggedRecordBatchIterator]( + move(iterator)) + return self + + def __iter__(self): + return self + + def __next__(self): + cdef CTaggedRecordBatch batch + with nogil: + batch = GetResultValue(move(self.iterator.get().Next())) + if batch.record_batch == NULL: + raise StopIteration + return TaggedRecordBatch( + record_batch=pyarrow_wrap_batch(batch.record_batch), + fragment=Fragment.wrap(batch.fragment)) + + +cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr, + object columns=None, Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + bint use_threads=True, MemoryPool memory_pool=None, + FragmentScanOptions fragment_scan_options=None)\ + except *: + cdef: + CScannerBuilder *builder + vector[CExpression] c_exprs + + builder = ptr.get() + + check_status(builder.Filter(_bind( + filter, pyarrow_wrap_schema(builder.schema())))) + + if columns is not None: + if isinstance(columns, dict): + for expr in columns.values(): + if not isinstance(expr, Expression): + raise TypeError( + "Expected an Expression for a 'column' dictionary " + "value, got {} instead".format(type(expr)) + ) + c_exprs.push_back(( expr).unwrap()) + + check_status( + builder.Project(c_exprs, [tobytes(c) for c in columns.keys()]) + ) + elif isinstance(columns, list): + check_status(builder.ProjectColumns([tobytes(c) for c in columns])) + else: + raise ValueError( + "Expected a list or a dict for 'columns', " + "got {} instead.".format(type(columns)) + ) + + check_status(builder.BatchSize(batch_size)) + check_status(builder.BatchReadahead(batch_readahead)) + check_status(builder.FragmentReadahead(fragment_readahead)) + 
check_status(builder.UseThreads(use_threads)) + check_status(builder.Pool(maybe_unbox_memory_pool(memory_pool))) + if fragment_scan_options: + check_status( + builder.FragmentScanOptions(fragment_scan_options.wrapped)) + + +cdef class Scanner(_Weakrefable): + """A materialized scan operation with context and options bound. + + A scanner is the class that glues the scan tasks, data fragments and data + sources together. + """ + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef void init(self, const shared_ptr[CScanner]& sp): + self.wrapped = sp + self.scanner = sp.get() + + @staticmethod + cdef wrap(const shared_ptr[CScanner]& sp): + cdef Scanner self = Scanner.__new__(Scanner) + self.init(sp) + return self + + cdef inline shared_ptr[CScanner] unwrap(self): + return self.wrapped + + @staticmethod + cdef shared_ptr[CScanOptions] _make_scan_options(Dataset dataset, dict py_scanoptions) except *: + cdef: + shared_ptr[CScannerBuilder] builder = make_shared[CScannerBuilder](dataset.unwrap()) + + py_scanoptions = dataset._scanner_options(py_scanoptions) + + # Need to explicitly expand the arguments as Cython doesn't support + # keyword expansion in cdef functions. + _populate_builder( + builder, + columns=py_scanoptions.get("columns"), + filter=py_scanoptions.get("filter"), + batch_size=py_scanoptions.get("batch_size", _DEFAULT_BATCH_SIZE), + batch_readahead=py_scanoptions.get( + "batch_readahead", _DEFAULT_BATCH_READAHEAD), + fragment_readahead=py_scanoptions.get( + "fragment_readahead", _DEFAULT_FRAGMENT_READAHEAD), + use_threads=py_scanoptions.get("use_threads", True), + memory_pool=py_scanoptions.get("memory_pool"), + fragment_scan_options=py_scanoptions.get("fragment_scan_options")) + + return GetResultValue(deref(builder).GetScanOptions()) + + @staticmethod + def from_dataset(Dataset dataset not None, *, + object columns=None, + Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, MemoryPool memory_pool=None): + """ + Create Scanner from Dataset, + + Parameters + ---------- + dataset : Dataset + Dataset to scan. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. 
+ batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + cdef: + shared_ptr[CScanOptions] options + shared_ptr[CScannerBuilder] builder + shared_ptr[CScanner] scanner + + options = Scanner._make_scan_options( + dataset, + dict(columns=columns, filter=filter, batch_size=batch_size, + batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, use_threads=use_threads, + memory_pool=memory_pool, fragment_scan_options=fragment_scan_options) + ) + builder = make_shared[CScannerBuilder](dataset.unwrap(), options) + scanner = GetResultValue(builder.get().Finish()) + return Scanner.wrap(scanner) + + @staticmethod + def from_fragment(Fragment fragment not None, *, Schema schema=None, + object columns=None, Expression filter=None, + int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, MemoryPool memory_pool=None): + """ + Create Scanner from Fragment, + + Parameters + ---------- + fragment : Fragment + fragment to scan. + schema : Schema, optional + The schema of the fragment. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. 
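Editor's note: a sketch of Scanner.from_dataset with both projection styles and a pushed-down filter; `dataset` is assumed to come from one of the sketches above and the "amount" column is hypothetical.

    import pyarrow.dataset as ds
    import pyarrow.compute as pc

    scanner = ds.Scanner.from_dataset(
        dataset,
        columns={
            "year": ds.field("year"),
            "double_amount": pc.multiply(ds.field("amount"), 2),
            "source": ds.field("__filename"),      # special field: originating file
        },
        filter=ds.field("year") >= 2009,
        batch_size=64_000,
    )
    table = scanner.to_table()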
If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + cdef: + shared_ptr[CScanOptions] options = make_shared[CScanOptions]() + shared_ptr[CScannerBuilder] builder + shared_ptr[CScanner] scanner + + schema = schema or fragment.physical_schema + + builder = make_shared[CScannerBuilder](pyarrow_unwrap_schema(schema), + fragment.unwrap(), options) + _populate_builder(builder, columns=columns, filter=filter, + batch_size=batch_size, batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, + use_threads=use_threads, + memory_pool=memory_pool, + fragment_scan_options=fragment_scan_options) + + scanner = GetResultValue(builder.get().Finish()) + return Scanner.wrap(scanner) + + @staticmethod + def from_batches(source, *, Schema schema=None, object columns=None, + Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + FragmentScanOptions fragment_scan_options=None, + bint use_threads=True, MemoryPool memory_pool=None): + """ + Create a Scanner from an iterator of batches. + + This creates a scanner which can be used only once. It is + intended to support writing a dataset (which takes a scanner) + from a source which can be read only once (e.g. a + RecordBatchReader or generator). + + Parameters + ---------- + source : Iterator + The iterator of Batches. + schema : Schema + The schema of the batches. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. 
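Editor's note: the fragment-level counterpart, again assuming `dataset` from the sketches above; a single fragment is scanned against its own physical schema.

    import pyarrow.dataset as ds

    fragment = next(iter(dataset.get_fragments()))
    scanner = ds.Scanner.from_fragment(fragment, schema=fragment.physical_schema)
    print(scanner.count_rows())    # rows in just this fragment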
Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + cdef: + shared_ptr[CScannerBuilder] builder + shared_ptr[CScanner] scanner + RecordBatchReader reader + if isinstance(source, pa.ipc.RecordBatchReader): + if schema: + raise ValueError('Cannot specify a schema when providing ' + 'a RecordBatchReader') + reader = source + elif _is_iterable(source): + if schema is None: + raise ValueError('Must provide schema to construct scanner ' + 'from an iterable') + reader = pa.ipc.RecordBatchReader.from_batches(schema, source) + else: + raise TypeError('Expected a RecordBatchReader or an iterable of ' + 'batches instead of the given type: ' + + type(source).__name__) + builder = CScannerBuilder.FromRecordBatchReader(reader.reader) + _populate_builder(builder, columns=columns, filter=filter, + batch_size=batch_size, batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, use_threads=use_threads, + memory_pool=memory_pool, + fragment_scan_options=fragment_scan_options) + scanner = GetResultValue(builder.get().Finish()) + return Scanner.wrap(scanner) + + @property + def dataset_schema(self): + """The schema with which batches will be read from fragments.""" + return pyarrow_wrap_schema( + self.scanner.options().get().dataset_schema) + + @property + def projected_schema(self): + """ + The materialized schema of the data, accounting for projections. + + This is the schema of any data returned from the scanner. + """ + return pyarrow_wrap_schema( + self.scanner.options().get().projected_schema) + + def to_batches(self): + """ + Consume a Scanner in record batches. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + def _iterator(batch_iter): + for batch in batch_iter: + yield batch.record_batch + # Don't make ourselves a generator so errors are raised immediately + return _iterator(self.scan_batches()) + + def scan_batches(self): + """ + Consume a Scanner in record batches with corresponding fragments. + + Returns + ------- + record_batches : iterator of TaggedRecordBatch + """ + cdef CTaggedRecordBatchIterator iterator + with nogil: + iterator = move(GetResultValue(self.scanner.ScanBatches())) + # Don't make ourselves a generator so errors are raised immediately + return TaggedRecordBatchIterator.wrap(self, move(iterator)) + + def to_table(self): + """ + Convert a Scanner into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. 
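Editor's note: from_batches builds a one-shot scanner over an in-memory stream, mainly useful for feeding a dataset write from a source that can only be read once. A minimal self-contained sketch:

    import pyarrow as pa
    import pyarrow.dataset as ds

    schema = pa.schema([("x", pa.int64())])
    batches = [pa.record_batch([pa.array([1, 2, 3])], schema=schema)]
    scanner = ds.Scanner.from_batches(iter(batches), schema=schema)

    # Consume it exactly once.
    for batch in scanner.to_batches():
        print(batch.num_rows)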
+ + Returns + ------- + Table + """ + cdef CResult[shared_ptr[CTable]] result + + with nogil: + result = self.scanner.ToTable() + + return pyarrow_wrap_table(GetResultValue(result)) + + def take(self, object indices): + """ + Select rows of data by index. + + Will only consume as many batches of the underlying dataset as + needed. Otherwise, this is equivalent to + ``to_table().take(indices)``. + + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + + Returns + ------- + Table + """ + cdef CResult[shared_ptr[CTable]] result + cdef shared_ptr[CArray] c_indices + + if not isinstance(indices, pa.Array): + indices = pa.array(indices) + c_indices = pyarrow_unwrap_array(indices) + + with nogil: + result = self.scanner.TakeRows(deref(c_indices)) + return pyarrow_wrap_table(GetResultValue(result)) + + def head(self, int num_rows): + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + + Returns + ------- + Table + """ + cdef CResult[shared_ptr[CTable]] result + with nogil: + result = self.scanner.Head(num_rows) + return pyarrow_wrap_table(GetResultValue(result)) + + def count_rows(self): + """ + Count rows matching the scanner filter. + + Returns + ------- + count : int + """ + cdef CResult[int64_t] result + with nogil: + result = self.scanner.CountRows() + return GetResultValue(result) + + def to_reader(self): + """Consume this scanner as a RecordBatchReader. + + Returns + ------- + RecordBatchReader + """ + cdef RecordBatchReader reader + reader = RecordBatchReader.__new__(RecordBatchReader) + reader.reader = GetResultValue(self.scanner.ToRecordBatchReader()) + return reader + + +def get_partition_keys(Expression partition_expression): + """ + Extract partition keys (equality constraints between a field and a scalar) + from an expression as a dict mapping the field's name to its value. + + NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning + will be conjunctions of equality conditions and are accessible through this + function. Other subexpressions will be ignored. + + Parameters + ---------- + partition_expression : pyarrow.dataset.Expression + + Returns + ------- + dict + + Examples + -------- + + For example, an expression of + + is converted to {'part': 'A', 'year': 2016} + """ + cdef: + CExpression expr = partition_expression.unwrap() + pair[CFieldRef, CDatum] ref_val + + out = {} + for ref_val in GetResultValue(CExtractKnownFieldValues(expr)).map: + assert ref_val.first.name() != nullptr + assert ref_val.second.kind() == DatumType_SCALAR + val = pyarrow_wrap_scalar(ref_val.second.scalar()) + out[frombytes(deref(ref_val.first.name()))] = val.as_py() + return out + + +cdef class WrittenFile(_Weakrefable): + """ + Metadata information about files written as + part of a dataset write operation + + Parameters + ---------- + path : str + Path to the file. + metadata : pyarrow.parquet.FileMetaData, optional + For Parquet files, the Parquet file metadata. + size : int + The size of the file in bytes. 
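Editor's note: get_partition_keys in action, as a small self-contained sketch. In practice the expression comes from `fragment.partition_expression` on a fragment discovered with a HivePartitioning or DirectoryPartitioning.

    import pyarrow.dataset as ds

    expr = (ds.field("part") == "A") & (ds.field("year") == 2016)
    print(ds.get_partition_keys(expr))   # {'part': 'A', 'year': 2016}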
+ """ + + def __init__(self, path, metadata, size): + self.path = path + self.metadata = metadata + self.size = size + + +cdef void _filesystemdataset_write_visitor( + dict visit_args, + CFileWriter* file_writer): + cdef: + str path + str base_dir + WrittenFile written_file + FileFormat file_format + + path = frombytes(deref(file_writer).destination().path) + base_dir = frombytes(visit_args['base_dir']) + file_format = FileFormat.wrap(file_writer.format()) + written_file = file_format._finish_write(path, base_dir, file_writer) + visit_args['file_visitor'](written_file) + + +def _filesystemdataset_write( + Scanner data not None, + object base_dir not None, + str basename_template not None, + FileSystem filesystem not None, + Partitioning partitioning not None, + FileWriteOptions file_options not None, + int max_partitions, + object file_visitor, + str existing_data_behavior not None, + int max_open_files, + int max_rows_per_file, + int min_rows_per_group, + int max_rows_per_group, + bool create_dir +): + """ + CFileSystemDataset.Write wrapper + """ + cdef: + CFileSystemDatasetWriteOptions c_options + shared_ptr[CScanner] c_scanner + dict visit_args + + c_options.file_write_options = file_options.unwrap() + c_options.filesystem = filesystem.unwrap() + c_options.base_dir = tobytes(_stringify_path(base_dir)) + c_options.partitioning = partitioning.unwrap() + c_options.max_partitions = max_partitions + c_options.max_open_files = max_open_files + c_options.max_rows_per_file = max_rows_per_file + c_options.max_rows_per_group = max_rows_per_group + c_options.min_rows_per_group = min_rows_per_group + c_options.basename_template = tobytes(basename_template) + if existing_data_behavior == 'error': + c_options.existing_data_behavior = ExistingDataBehavior_ERROR + elif existing_data_behavior == 'overwrite_or_ignore': + c_options.existing_data_behavior =\ + ExistingDataBehavior_OVERWRITE_OR_IGNORE + elif existing_data_behavior == 'delete_matching': + c_options.existing_data_behavior = ExistingDataBehavior_DELETE_MATCHING + else: + raise ValueError( + ("existing_data_behavior must be one of 'error', ", + "'overwrite_or_ignore' or 'delete_matching'") + ) + c_options.create_dir = create_dir + + if file_visitor is not None: + visit_args = {'base_dir': c_options.base_dir, + 'file_visitor': file_visitor} + # Need to use post_finish because parquet metadata is not available + # until after Finish has been called + c_options.writer_post_finish = BindFunction[cb_writer_finish_internal]( + &_filesystemdataset_write_visitor, visit_args) + + c_scanner = data.unwrap() + with nogil: + check_status(CFileSystemDataset.Write(c_options, c_scanner)) + + +cdef class _ScanNodeOptions(ExecNodeOptions): + + def _set_options(self, Dataset dataset, dict scan_options): + cdef: + shared_ptr[CScanOptions] c_scan_options + + c_scan_options = Scanner._make_scan_options(dataset, scan_options) + + self.wrapped.reset( + new CScanNodeOptions(dataset.unwrap(), c_scan_options) + ) + + +class ScanNodeOptions(_ScanNodeOptions): + """ + A Source node which yields batches from a Dataset scan. + + This is the option class for the "scan" node factory. + + This node is capable of applying pushdown projections or filters + to the file readers which reduce the amount of data that needs to + be read (if supported by the file format). But note that this does not + construct associated filter or project nodes to perform the final + filtering or projection. 
Rather, you may supply the same filter + expression or projection to the scan node that you also supply + to the filter or project node. + + Yielded batches will be augmented with fragment/batch indices to + enable stable ordering for simple ExecPlans. + + Parameters + ---------- + dataset : pyarrow.dataset.Dataset + The table which acts as the data source. + **kwargs : dict, optional + Scan options. See `Scanner.from_dataset` for possible arguments. + """ + + def __init__(self, Dataset dataset, **kwargs): + self._set_options(dataset, kwargs) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_orc.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_orc.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..3aaae1e097d1799cd85af486e6a380ab6b1e4348 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_orc.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_orc.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_orc.pyx new file mode 100644 index 0000000000000000000000000000000000000000..a8cce3362225adcfd7e70b51e521f26d43d9a102 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_orc.pyx @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
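Editor's note: a sketch of the scan node inside an Acero plan, echoing the note above that the filter must be re-applied by a downstream filter node. This assumes the `pyarrow.acero` module and that ScanNodeOptions is importable from pyarrow.dataset (the exact import surface may vary between pyarrow versions); `dataset` comes from the earlier sketches.

    import pyarrow.dataset as ds
    import pyarrow.acero as acero

    pred = ds.field("year") >= 2009
    plan = acero.Declaration.from_sequence([
        acero.Declaration("scan", ds.ScanNodeOptions(dataset, filter=pred)),
        acero.Declaration("filter", acero.FilterNodeOptions(pred)),  # re-apply the predicate
    ])
    table = plan.to_table()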
+ +# cython: language_level = 3 + +"""Dataset support for ORC file format.""" + +from pyarrow.lib cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_dataset cimport * + +from pyarrow._dataset cimport FileFormat + + +cdef class OrcFileFormat(FileFormat): + + def __init__(self): + self.init(shared_ptr[CFileFormat](new COrcFileFormat())) + + def equals(self, OrcFileFormat other): + """ + Parameters + ---------- + other : pyarrow.dataset.OrcFileFormat + + Returns + ------- + True + """ + return True + + @property + def default_extname(self): + return "orc" + + def __reduce__(self): + return OrcFileFormat, tuple() diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..d40b9d60fcddd499992abcf6752853130c244af6 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.pxd new file mode 100644 index 0000000000000000000000000000000000000000..0a3a2ff526ea48d7160763791ec3531a3b249bca --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.pxd @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +"""Dataset support for Parquet file format.""" + +from pyarrow.includes.libarrow_dataset cimport * +from pyarrow.includes.libarrow_dataset_parquet cimport * + +from pyarrow._dataset cimport FragmentScanOptions, FileWriteOptions + + +cdef class ParquetFragmentScanOptions(FragmentScanOptions): + cdef: + CParquetFragmentScanOptions* parquet_options + object _parquet_decryption_config + object _decryption_properties + + cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp) + cdef CReaderProperties* reader_properties(self) + cdef ArrowReaderProperties* arrow_reader_properties(self) + + +cdef class ParquetFileWriteOptions(FileWriteOptions): + + cdef: + CParquetFileWriteOptions* parquet_options + object _properties diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.pyx new file mode 100644 index 0000000000000000000000000000000000000000..8fe9f30d33af9bc5cbf7cb25978334292f5ae9dc --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet.pyx @@ -0,0 +1,1053 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +"""Dataset support for Parquet file format.""" + +from cython cimport binding +from cython.operator cimport dereference as deref + +import os +import warnings + +import pyarrow as pa +from pyarrow.lib cimport * +from pyarrow.lib import frombytes, tobytes, is_threading_enabled +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_dataset cimport * +from pyarrow.includes.libarrow_dataset_parquet cimport * +from pyarrow._fs cimport FileSystem + +from pyarrow._compute cimport Expression, _bind +from pyarrow._dataset cimport ( + _make_file_source, + DatasetFactory, + FileFormat, + FileFragment, + FileWriteOptions, + Fragment, + FragmentScanOptions, + CacheOptions, + Partitioning, + PartitioningFactory, + WrittenFile +) + +from pyarrow._parquet cimport ( + _create_writer_properties, _create_arrow_writer_properties, + FileMetaData, +) + + +try: + from pyarrow._dataset_parquet_encryption import ( + set_encryption_config, set_decryption_config, set_decryption_properties + ) + parquet_encryption_enabled = True +except ImportError: + parquet_encryption_enabled = False + + +cdef Expression _true = Expression._scalar(True) + +ctypedef CParquetFileWriter* _CParquetFileWriterPtr + + +cdef class ParquetFileFormat(FileFormat): + """ + FileFormat for Parquet + + Parameters + ---------- + read_options : ParquetReadOptions + Read options for the file. + default_fragment_scan_options : ParquetFragmentScanOptions + Scan Options for the file. 
+ **kwargs : dict + Additional options for read option or scan option + """ + + cdef: + CParquetFileFormat* parquet_format + + def __init__(self, read_options=None, + default_fragment_scan_options=None, + **kwargs): + cdef: + shared_ptr[CParquetFileFormat] wrapped + CParquetFileFormatReaderOptions* options + + # Read/scan options + read_options_args = {option: kwargs[option] for option in kwargs + if option in _PARQUET_READ_OPTIONS} + scan_args = {option: kwargs[option] for option in kwargs + if option not in _PARQUET_READ_OPTIONS} + if read_options and read_options_args: + duplicates = ', '.join(sorted(read_options_args)) + raise ValueError(f'If `read_options` is given, ' + f'cannot specify {duplicates}') + if default_fragment_scan_options and scan_args: + duplicates = ', '.join(sorted(scan_args)) + raise ValueError(f'If `default_fragment_scan_options` is given, ' + f'cannot specify {duplicates}') + + if read_options is None: + read_options = ParquetReadOptions(**read_options_args) + elif isinstance(read_options, dict): + # For backwards compatibility + duplicates = [] + for option, value in read_options.items(): + if option in _PARQUET_READ_OPTIONS: + read_options_args[option] = value + else: + duplicates.append(option) + scan_args[option] = value + if duplicates: + duplicates = ", ".join(duplicates) + warnings.warn(f'The scan options {duplicates} should be ' + 'specified directly as keyword arguments') + read_options = ParquetReadOptions(**read_options_args) + elif not isinstance(read_options, ParquetReadOptions): + raise TypeError('`read_options` must be either a dictionary or an ' + 'instance of ParquetReadOptions') + + if default_fragment_scan_options is None: + default_fragment_scan_options = ParquetFragmentScanOptions(**scan_args) + elif isinstance(default_fragment_scan_options, dict): + default_fragment_scan_options = ParquetFragmentScanOptions( + **default_fragment_scan_options) + elif not isinstance(default_fragment_scan_options, + ParquetFragmentScanOptions): + raise TypeError('`default_fragment_scan_options` must be either a ' + 'dictionary or an instance of ' + 'ParquetFragmentScanOptions') + + wrapped = make_shared[CParquetFileFormat]() + + options = &(wrapped.get().reader_options) + if read_options.dictionary_columns is not None: + for column in read_options.dictionary_columns: + options.dict_columns.insert(tobytes(column)) + options.coerce_int96_timestamp_unit = \ + read_options._coerce_int96_timestamp_unit + + self.init( wrapped) + self.default_fragment_scan_options = default_fragment_scan_options + + cdef void init(self, const shared_ptr[CFileFormat]& sp): + FileFormat.init(self, sp) + self.parquet_format = sp.get() + + cdef WrittenFile _finish_write(self, path, base_dir, + CFileWriter* file_writer): + cdef: + FileMetaData parquet_metadata + CParquetFileWriter* parquet_file_writer + + parquet_metadata = None + parquet_file_writer = dynamic_cast[_CParquetFileWriterPtr](file_writer) + with nogil: + metadata = deref( + deref(parquet_file_writer).parquet_writer()).metadata() + if metadata: + parquet_metadata = FileMetaData() + parquet_metadata.init(metadata) + parquet_metadata.set_file_path(os.path.relpath(path, base_dir)) + + size = GetResultValue(file_writer.GetBytesWritten()) + + return WrittenFile(path, parquet_metadata, size) + + @property + def read_options(self): + cdef CParquetFileFormatReaderOptions* options + options = &self.parquet_format.reader_options + parquet_read_options = ParquetReadOptions( + dictionary_columns={frombytes(col) + for col in 
options.dict_columns}, + ) + # Read options getter/setter works with strings so setting + # the private property which uses the C Type + parquet_read_options._coerce_int96_timestamp_unit = \ + options.coerce_int96_timestamp_unit + return parquet_read_options + + def make_write_options(self, **kwargs): + """ + Parameters + ---------- + **kwargs : dict + + Returns + ------- + pyarrow.dataset.FileWriteOptions + """ + # Safeguard from calling make_write_options as a static class method + if not isinstance(self, ParquetFileFormat): + raise TypeError("make_write_options() should be called on " + "an instance of ParquetFileFormat") + opts = FileFormat.make_write_options(self) + ( opts).update(**kwargs) + return opts + + cdef _set_default_fragment_scan_options(self, FragmentScanOptions options): + if options.type_name == 'parquet': + self.parquet_format.default_fragment_scan_options = options.wrapped + else: + super()._set_default_fragment_scan_options(options) + + def equals(self, ParquetFileFormat other): + """ + Parameters + ---------- + other : pyarrow.dataset.ParquetFileFormat + + Returns + ------- + bool + """ + return ( + self.read_options.equals(other.read_options) and + self.default_fragment_scan_options == + other.default_fragment_scan_options + ) + + @property + def default_extname(self): + return "parquet" + + def __reduce__(self): + return ParquetFileFormat, (self.read_options, + self.default_fragment_scan_options) + + def __repr__(self): + return f"" + + def make_fragment(self, file, filesystem=None, + Expression partition_expression=None, row_groups=None, *, file_size=None): + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + row_groups : Iterable, optional + The indices of the row groups to include + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + cdef: + vector[int] c_row_groups + if partition_expression is None: + partition_expression = _true + if row_groups is None: + return super().make_fragment(file, filesystem, + partition_expression, file_size=file_size) + + c_source = _make_file_source(file, filesystem, file_size) + c_row_groups = [ row_group for row_group in set(row_groups)] + + c_fragment = GetResultValue( + self.parquet_format.MakeFragment(move(c_source), + partition_expression.unwrap(), + nullptr, + move(c_row_groups))) + return Fragment.wrap(move(c_fragment)) + + +class RowGroupInfo: + """ + A wrapper class for RowGroup information + + Parameters + ---------- + id : integer + The group ID. + metadata : FileMetaData + The rowgroup metadata. + schema : Schema + Schema of the rows. 
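Editor's note: pulling the Parquet-specific pieces together — a sketch of a ParquetFileFormat configured with read and scan options, a row-group restricted fragment, and write options. The file path and column names are hypothetical.

    import pyarrow.dataset as ds
    from pyarrow import fs

    fmt = ds.ParquetFileFormat(
        dictionary_columns=["category"],
        coerce_int96_timestamp_unit="ms",
        pre_buffer=True,               # routed into the default fragment scan options
    )
    fragment = fmt.make_fragment(
        "data/file.parquet", filesystem=fs.LocalFileSystem(), row_groups=[0, 2])
    write_options = fmt.make_write_options(compression="zstd")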
+ """ + + def __init__(self, id, metadata, schema): + self.id = id + self.metadata = metadata + self.schema = schema + + @property + def num_rows(self): + return self.metadata.num_rows + + @property + def total_byte_size(self): + return self.metadata.total_byte_size + + @property + def statistics(self): + def name_stats(i): + col = self.metadata.column(i) + + stats = col.statistics + if stats is None or not stats.has_min_max: + return None, None + + name = col.path_in_schema + field_index = self.schema.get_field_index(name) + if field_index < 0: + return None, None + + typ = self.schema.field(field_index).type + return col.path_in_schema, { + 'min': pa.scalar(stats.min, type=typ).as_py(), + 'max': pa.scalar(stats.max, type=typ).as_py() + } + + return { + name: stats for name, stats + in map(name_stats, range(self.metadata.num_columns)) + if stats is not None + } + + def __repr__(self): + return "RowGroupInfo({})".format(self.id) + + def __eq__(self, other): + if isinstance(other, int): + return self.id == other + if not isinstance(other, RowGroupInfo): + return False + return self.id == other.id + + +cdef class ParquetFileFragment(FileFragment): + """A Fragment representing a parquet file.""" + + cdef: + CParquetFileFragment* parquet_file_fragment + + cdef void init(self, const shared_ptr[CFragment]& sp): + FileFragment.init(self, sp) + self.parquet_file_fragment = sp.get() + + def __reduce__(self): + buffer = self.buffer + # parquet_file_fragment.row_groups() is empty if the metadata + # information of the file is not yet populated + if not bool(self.parquet_file_fragment.row_groups()): + row_groups = None + else: + row_groups = [row_group.id for row_group in self.row_groups] + + return self.format.make_fragment, ( + self.path if buffer is None else buffer, + self.filesystem, + self.partition_expression, + row_groups + ) + + def ensure_complete_metadata(self): + """ + Ensure that all metadata (statistics, physical schema, ...) have + been read and cached in this fragment. + """ + with nogil: + check_status(self.parquet_file_fragment.EnsureCompleteMetadata()) + + @property + def row_groups(self): + metadata = self.metadata + cdef vector[int] row_groups = self.parquet_file_fragment.row_groups() + return [RowGroupInfo(i, metadata.row_group(i), self.physical_schema) + for i in row_groups] + + @property + def metadata(self): + self.ensure_complete_metadata() + cdef FileMetaData metadata = FileMetaData() + metadata.init(self.parquet_file_fragment.metadata()) + return metadata + + @property + def num_row_groups(self): + """ + Return the number of row groups viewed by this fragment (not the + number of row groups in the origin file). + """ + self.ensure_complete_metadata() + return self.parquet_file_fragment.row_groups().size() + + def split_by_row_group(self, Expression filter=None, + Schema schema=None): + """ + Split the fragment into multiple fragments. + + Yield a Fragment wrapping each row group in this ParquetFileFragment. + Row groups will be excluded whose metadata contradicts the optional + filter. + + Parameters + ---------- + filter : Expression, default None + Only include the row groups which satisfy this predicate (using + the Parquet RowGroup statistics). + schema : Schema, default None + Schema to use when filtering row groups. 
Defaults to the + Fragment's physical schema + + Returns + ------- + A list of Fragments + """ + cdef: + vector[shared_ptr[CFragment]] c_fragments + CExpression c_filter + shared_ptr[CFragment] c_fragment + + schema = schema or self.physical_schema + c_filter = _bind(filter, schema) + with nogil: + c_fragments = move(GetResultValue( + self.parquet_file_fragment.SplitByRowGroup(move(c_filter)))) + + return [Fragment.wrap(c_fragment) for c_fragment in c_fragments] + + def subset(self, Expression filter=None, Schema schema=None, + object row_group_ids=None): + """ + Create a subset of the fragment (viewing a subset of the row groups). + + Subset can be specified by either a filter predicate (with optional + schema) or by a list of row group IDs. Note that when using a filter, + the resulting fragment can be empty (viewing no row groups). + + Parameters + ---------- + filter : Expression, default None + Only include the row groups which satisfy this predicate (using + the Parquet RowGroup statistics). + schema : Schema, default None + Schema to use when filtering row groups. Defaults to the + Fragment's physical schema + row_group_ids : list of ints + The row group IDs to include in the subset. Can only be specified + if `filter` is None. + + Returns + ------- + ParquetFileFragment + """ + cdef: + CExpression c_filter + vector[int] c_row_group_ids + shared_ptr[CFragment] c_fragment + + if filter is not None and row_group_ids is not None: + raise ValueError( + "Cannot specify both 'filter' and 'row_group_ids'." + ) + + if filter is not None: + schema = schema or self.physical_schema + c_filter = _bind(filter, schema) + with nogil: + c_fragment = move(GetResultValue( + self.parquet_file_fragment.SubsetWithFilter( + move(c_filter)))) + elif row_group_ids is not None: + c_row_group_ids = [ + row_group for row_group in sorted(set(row_group_ids)) + ] + with nogil: + c_fragment = move(GetResultValue( + self.parquet_file_fragment.SubsetWithIds( + move(c_row_group_ids)))) + else: + raise ValueError( + "Need to specify one of 'filter' or 'row_group_ids'" + ) + + return Fragment.wrap(c_fragment) + + +cdef class ParquetReadOptions(_Weakrefable): + """ + Parquet format specific options for reading. + + Parameters + ---------- + dictionary_columns : list of string, default None + Names of columns which should be dictionary encoded as + they are read + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). 
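Editor's note: continuing the sketch above, row-group level inspection and pruning on a ParquetFileFragment; `fragment` and the "category" column are the hypothetical ones introduced earlier.

    import pyarrow.dataset as ds

    fragment.ensure_complete_metadata()
    print(fragment.num_row_groups)
    for rg in fragment.row_groups:
        print(rg.id, rg.num_rows, rg.statistics)

    # One fragment per row group that may satisfy the predicate.
    pieces = fragment.split_by_row_group(filter=ds.field("category") == "a")

    # Or view an explicit subset of the row groups.
    subset = fragment.subset(row_group_ids=[0])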
Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds + """ + + cdef public: + set dictionary_columns + TimeUnit _coerce_int96_timestamp_unit + + # Also see _PARQUET_READ_OPTIONS + def __init__(self, dictionary_columns=None, + coerce_int96_timestamp_unit=None): + self.dictionary_columns = set(dictionary_columns or set()) + self.coerce_int96_timestamp_unit = coerce_int96_timestamp_unit + + @property + def coerce_int96_timestamp_unit(self): + return timeunit_to_string(self._coerce_int96_timestamp_unit) + + @coerce_int96_timestamp_unit.setter + def coerce_int96_timestamp_unit(self, unit): + if unit is not None: + self._coerce_int96_timestamp_unit = string_to_timeunit(unit) + else: + self._coerce_int96_timestamp_unit = TimeUnit_NANO + + def equals(self, ParquetReadOptions other): + """ + Parameters + ---------- + other : pyarrow.dataset.ParquetReadOptions + + Returns + ------- + bool + """ + return (self.dictionary_columns == other.dictionary_columns and + self.coerce_int96_timestamp_unit == + other.coerce_int96_timestamp_unit) + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False + + def __repr__(self): + return ( + f"" + ) + + +cdef class ParquetFileWriteOptions(FileWriteOptions): + + def update(self, **kwargs): + """ + Parameters + ---------- + **kwargs : dict + """ + arrow_fields = { + "use_deprecated_int96_timestamps", + "coerce_timestamps", + "allow_truncated_timestamps", + "use_compliant_nested_type", + } + + setters = set() + for name, value in kwargs.items(): + if name not in self._properties: + raise TypeError("unexpected parquet write option: " + name) + self._properties[name] = value + if name in arrow_fields: + setters.add(self._set_arrow_properties) + elif name == "encryption_config" and value is not None: + setters.add(self._set_encryption_config) + else: + setters.add(self._set_properties) + + for setter in setters: + setter() + + def _set_properties(self): + cdef CParquetFileWriteOptions* opts = self.parquet_options + + opts.writer_properties = _create_writer_properties( + use_dictionary=self._properties["use_dictionary"], + compression=self._properties["compression"], + version=self._properties["version"], + write_statistics=self._properties["write_statistics"], + data_page_size=self._properties["data_page_size"], + compression_level=self._properties["compression_level"], + use_byte_stream_split=( + self._properties["use_byte_stream_split"] + ), + column_encoding=self._properties["column_encoding"], + data_page_version=self._properties["data_page_version"], + encryption_properties=self._properties["encryption_properties"], + write_batch_size=self._properties["write_batch_size"], + dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"], + write_page_index=self._properties["write_page_index"], + write_page_checksum=self._properties["write_page_checksum"], + sorting_columns=self._properties["sorting_columns"], + store_decimal_as_integer=self._properties["store_decimal_as_integer"], + ) + + def _set_arrow_properties(self): + cdef CParquetFileWriteOptions* opts = self.parquet_options + + opts.arrow_writer_properties = _create_arrow_writer_properties( + use_deprecated_int96_timestamps=( + self._properties["use_deprecated_int96_timestamps"] + ), + coerce_timestamps=self._properties["coerce_timestamps"], + allow_truncated_timestamps=( + self._properties["allow_truncated_timestamps"] + ), + writer_engine_version="V2", + use_compliant_nested_type=( + 
self._properties["use_compliant_nested_type"] + ) + ) + + def _set_encryption_config(self): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Encryption is not enabled in your installation of pyarrow, but an " + "encryption_config was provided." + ) + set_encryption_config(self, self._properties["encryption_config"]) + + cdef void init(self, const shared_ptr[CFileWriteOptions]& sp): + FileWriteOptions.init(self, sp) + self.parquet_options = sp.get() + self._properties = dict( + use_dictionary=True, + compression="snappy", + version="2.6", + write_statistics=None, + data_page_size=None, + compression_level=None, + use_byte_stream_split=False, + column_encoding=None, + data_page_version="1.0", + use_deprecated_int96_timestamps=False, + coerce_timestamps=None, + allow_truncated_timestamps=False, + use_compliant_nested_type=True, + encryption_properties=None, + write_batch_size=None, + dictionary_pagesize_limit=None, + write_page_index=False, + encryption_config=None, + write_page_checksum=False, + sorting_columns=None, + store_decimal_as_integer=False, + ) + + self._set_properties() + self._set_arrow_properties() + + def __repr__(self): + return "".format( + " ".join([f"{key}={value}" for key, value in self._properties.items()]) + ) + + +cdef set _PARQUET_READ_OPTIONS = { + 'dictionary_columns', 'coerce_int96_timestamp_unit' +} + + +cdef class ParquetFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for Parquet fragments. + + Parameters + ---------- + use_buffered_stream : bool, default False + Read files through buffered input streams rather than loading entire + row groups at once. This may be enabled to reduce memory overhead. + Disabled by default. + buffer_size : int, default 8192 + Size of buffered stream, if enabled. Default is 8KB. + pre_buffer : bool, default True + If enabled, pre-buffer the raw Parquet data instead of issuing one + read per column chunk. This can improve performance on high-latency + filesystems (e.g. S3, GCS) by coalescing and issuing file reads in + parallel using a background I/O thread pool. + Set to False if you want to prioritize minimal memory usage + over maximum speed. + cache_options : pyarrow.CacheOptions, default None + Cache options used when pre_buffer is enabled. The default values should + be good for most use cases. You may want to adjust these for example if + you have exceptionally high latency to the file system. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None + If not None, use the provided ParquetDecryptionConfig to decrypt the + Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. 
+ """ + + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, *, bint use_buffered_stream=False, + buffer_size=8192, + bint pre_buffer=True, + cache_options=None, + thrift_string_size_limit=None, + thrift_container_size_limit=None, + decryption_config=None, + decryption_properties=None, + bint page_checksum_verification=False): + self.init(shared_ptr[CFragmentScanOptions]( + new CParquetFragmentScanOptions())) + self.use_buffered_stream = use_buffered_stream + self.buffer_size = buffer_size + if pre_buffer and not is_threading_enabled(): + pre_buffer = False + self.pre_buffer = pre_buffer + if cache_options is not None: + self.cache_options = cache_options + if thrift_string_size_limit is not None: + self.thrift_string_size_limit = thrift_string_size_limit + if thrift_container_size_limit is not None: + self.thrift_container_size_limit = thrift_container_size_limit + if decryption_config is not None: + self.parquet_decryption_config = decryption_config + if decryption_properties is not None: + self.decryption_properties = decryption_properties + self.page_checksum_verification = page_checksum_verification + + cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): + FragmentScanOptions.init(self, sp) + self.parquet_options = sp.get() + + cdef CReaderProperties* reader_properties(self): + return self.parquet_options.reader_properties.get() + + cdef ArrowReaderProperties* arrow_reader_properties(self): + return self.parquet_options.arrow_reader_properties.get() + + @property + def use_buffered_stream(self): + return self.reader_properties().is_buffered_stream_enabled() + + @use_buffered_stream.setter + def use_buffered_stream(self, bint use_buffered_stream): + if use_buffered_stream: + self.reader_properties().enable_buffered_stream() + else: + self.reader_properties().disable_buffered_stream() + + @property + def buffer_size(self): + return self.reader_properties().buffer_size() + + @buffer_size.setter + def buffer_size(self, buffer_size): + if buffer_size <= 0: + raise ValueError("Buffer size must be larger than zero") + self.reader_properties().set_buffer_size(buffer_size) + + @property + def pre_buffer(self): + return self.arrow_reader_properties().pre_buffer() + + @pre_buffer.setter + def pre_buffer(self, bint pre_buffer): + if pre_buffer and not is_threading_enabled(): + return + self.arrow_reader_properties().set_pre_buffer(pre_buffer) + + @property + def cache_options(self): + return CacheOptions.wrap(self.arrow_reader_properties().cache_options()) + + @cache_options.setter + def cache_options(self, CacheOptions options): + self.arrow_reader_properties().set_cache_options(options.unwrap()) + + @property + def thrift_string_size_limit(self): + return self.reader_properties().thrift_string_size_limit() + + @thrift_string_size_limit.setter + def thrift_string_size_limit(self, size): + if size <= 0: + raise ValueError("size must be larger than zero") + self.reader_properties().set_thrift_string_size_limit(size) + + @property + def thrift_container_size_limit(self): + return self.reader_properties().thrift_container_size_limit() + + @thrift_container_size_limit.setter + def thrift_container_size_limit(self, size): + if size <= 0: + raise ValueError("size must be larger than zero") + self.reader_properties().set_thrift_container_size_limit(size) + + @property + def decryption_properties(self): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Unable to access encryption features. 
" + "Encryption is not enabled in your installation of pyarrow." + ) + return self._decryption_properties + + @decryption_properties.setter + def decryption_properties(self, config): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Encryption is not enabled in your installation of pyarrow, but " + "decryption_properties were provided." + ) + set_decryption_properties(self, config) + self._decryption_properties = config + + @property + def parquet_decryption_config(self): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Unable to access encryption features. " + "Encryption is not enabled in your installation of pyarrow." + ) + return self._parquet_decryption_config + + @parquet_decryption_config.setter + def parquet_decryption_config(self, config): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Encryption is not enabled in your installation of pyarrow, but a " + "decryption_config was provided." + ) + set_decryption_config(self, config) + self._parquet_decryption_config = config + + @property + def page_checksum_verification(self): + return self.reader_properties().page_checksum_verification() + + @page_checksum_verification.setter + def page_checksum_verification(self, bint page_checksum_verification): + self.reader_properties().set_page_checksum_verification(page_checksum_verification) + + def equals(self, ParquetFragmentScanOptions other): + """ + Parameters + ---------- + other : pyarrow.dataset.ParquetFragmentScanOptions + + Returns + ------- + bool + """ + attrs = ( + self.use_buffered_stream, self.buffer_size, self.pre_buffer, self.cache_options, + self.thrift_string_size_limit, self.thrift_container_size_limit, + self.page_checksum_verification) + other_attrs = ( + other.use_buffered_stream, other.buffer_size, other.pre_buffer, other.cache_options, + other.thrift_string_size_limit, + other.thrift_container_size_limit, other.page_checksum_verification) + return attrs == other_attrs + + @staticmethod + @binding(True) # Required for Cython < 3 + def _reconstruct(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + return ParquetFragmentScanOptions(**kwargs) + + def __reduce__(self): + kwargs = dict( + use_buffered_stream=self.use_buffered_stream, + buffer_size=self.buffer_size, + pre_buffer=self.pre_buffer, + cache_options=self.cache_options, + thrift_string_size_limit=self.thrift_string_size_limit, + thrift_container_size_limit=self.thrift_container_size_limit, + page_checksum_verification=self.page_checksum_verification + ) + return ParquetFragmentScanOptions._reconstruct, (kwargs,) + + +cdef class ParquetFactoryOptions(_Weakrefable): + """ + Influences the discovery of parquet dataset. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning, PartitioningFactory, optional + The partitioning scheme applied to fragments, see ``Partitioning``. + validate_column_chunk_paths : bool, default False + Assert that all ColumnChunk paths are consistent. The parquet spec + allows for ColumnChunk data to be stored in multiple files, but + ParquetDatasetFactory supports only a single file with all ColumnChunk + data. 
If this flag is set construction of a ParquetDatasetFactory will + raise an error if ColumnChunk data is not resident in a single file. + """ + + cdef: + CParquetFactoryOptions options + + __slots__ = () # avoid mistakingly creating attributes + + def __init__(self, partition_base_dir=None, partitioning=None, + validate_column_chunk_paths=False): + if isinstance(partitioning, PartitioningFactory): + self.partitioning_factory = partitioning + elif isinstance(partitioning, Partitioning): + self.partitioning = partitioning + + if partition_base_dir is not None: + self.partition_base_dir = partition_base_dir + + self.options.validate_column_chunk_paths = validate_column_chunk_paths + + cdef inline CParquetFactoryOptions unwrap(self): + return self.options + + @property + def partitioning(self): + """Partitioning to apply to discovered files. + + NOTE: setting this property will overwrite partitioning_factory. + """ + c_partitioning = self.options.partitioning.partitioning() + if c_partitioning.get() == nullptr: + return None + return Partitioning.wrap(c_partitioning) + + @partitioning.setter + def partitioning(self, Partitioning value): + self.options.partitioning = ( value).unwrap() + + @property + def partitioning_factory(self): + """PartitioningFactory to apply to discovered files and + discover a Partitioning. + + NOTE: setting this property will overwrite partitioning. + """ + c_factory = self.options.partitioning.factory() + if c_factory.get() == nullptr: + return None + return PartitioningFactory.wrap(c_factory, None, None) + + @partitioning_factory.setter + def partitioning_factory(self, PartitioningFactory value): + self.options.partitioning = ( value).unwrap() + + @property + def partition_base_dir(self): + """ + Base directory to strip paths before applying the partitioning. + """ + return frombytes(self.options.partition_base_dir) + + @partition_base_dir.setter + def partition_base_dir(self, value): + self.options.partition_base_dir = tobytes(value) + + @property + def validate_column_chunk_paths(self): + """ + Base directory to strip paths before applying the partitioning. + """ + return self.options.validate_column_chunk_paths + + @validate_column_chunk_paths.setter + def validate_column_chunk_paths(self, value): + self.options.validate_column_chunk_paths = value + + +cdef class ParquetDatasetFactory(DatasetFactory): + """ + Create a ParquetDatasetFactory from a Parquet `_metadata` file. + + Parameters + ---------- + metadata_path : str + Path to the `_metadata` parquet metadata-only file generated with + `pyarrow.parquet.write_metadata`. + filesystem : pyarrow.fs.FileSystem + Filesystem to read the metadata_path from, and subsequent parquet + files. + format : ParquetFileFormat + Parquet format options. + options : ParquetFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. 
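# Sketch of the discovery path driven by ParquetFactoryOptions /
# ParquetDatasetFactory; the "_metadata" path and partitioning are assumptions.
import pyarrow.dataset as ds

dataset = ds.parquet_dataset("my_dataset/_metadata", partitioning="hive")
table = dataset.to_table()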
+ """ + + cdef: + CParquetDatasetFactory* parquet_factory + + def __init__(self, metadata_path, FileSystem filesystem not None, + FileFormat format not None, + ParquetFactoryOptions options=None): + cdef: + c_string c_path + shared_ptr[CFileSystem] c_filesystem + shared_ptr[CParquetFileFormat] c_format + CResult[shared_ptr[CDatasetFactory]] result + CParquetFactoryOptions c_options + + c_path = tobytes(metadata_path) + c_filesystem = filesystem.unwrap() + c_format = static_pointer_cast[CParquetFileFormat, CFileFormat]( + format.unwrap()) + options = options or ParquetFactoryOptions() + c_options = options.unwrap() + + with nogil: + result = CParquetDatasetFactory.MakeFromMetaDataPath( + c_path, c_filesystem, c_format, c_options) + self.init(GetResultValue(result)) + + cdef init(self, shared_ptr[CDatasetFactory]& sp): + DatasetFactory.init(self, sp) + self.parquet_factory = sp.get() diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet_encryption.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet_encryption.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..f160668a3d94fb15e3cedd8a4460ab9404039684 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet_encryption.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet_encryption.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet_encryption.pyx new file mode 100644 index 0000000000000000000000000000000000000000..c8f5e5b01bf81f32d641d70341fe74bf6bfbbc80 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dataset_parquet_encryption.pyx @@ -0,0 +1,178 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +"""Dataset support for Parquet encryption.""" + +from pyarrow.includes.libarrow_dataset_parquet cimport * +from pyarrow._parquet_encryption cimport * +from pyarrow._dataset_parquet cimport ParquetFragmentScanOptions, ParquetFileWriteOptions + + +cdef class ParquetEncryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level encryption + within the Parquet framework. + + The ParquetEncryptionConfig class serves as a bridge for passing encryption-related + parameters to the appropriate components within the Parquet library. It maintains references + to objects that define the encryption strategy, Key Management Service (KMS) configuration, + and specific encryption configurations for Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object. 
The `CryptoFactory` is responsible for + creating cryptographic components, such as encryptors and decryptors. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration + parameters necessary for connecting to a Key Management Service (KMS). + encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration + Shared pointer to an `EncryptionConfiguration` object. This object defines specific + encryption settings for Parquet data, including the keys assigned to different columns. + + Raises + ------ + ValueError + Raised if `encryption_config` is None. + """ + cdef: + shared_ptr[CParquetEncryptionConfig] c_config + + # Avoid mistakenly creating attributes + __slots__ = () + + def __cinit__(self, CryptoFactory crypto_factory, KmsConnectionConfig kms_connection_config, + EncryptionConfiguration encryption_config): + + cdef shared_ptr[CEncryptionConfiguration] c_encryption_config + + if crypto_factory is None: + raise ValueError("crypto_factory cannot be None") + + if kms_connection_config is None: + raise ValueError("kms_connection_config cannot be None") + + if encryption_config is None: + raise ValueError("encryption_config cannot be None") + + self.c_config.reset(new CParquetEncryptionConfig()) + + c_encryption_config = pyarrow_unwrap_encryptionconfig( + encryption_config) + + self.c_config.get().crypto_factory = pyarrow_unwrap_cryptofactory(crypto_factory) + self.c_config.get().kms_connection_config = pyarrow_unwrap_kmsconnectionconfig( + kms_connection_config) + self.c_config.get().encryption_config = c_encryption_config + + @staticmethod + cdef wrap(shared_ptr[CParquetEncryptionConfig] c_config): + cdef ParquetEncryptionConfig python_config = ParquetEncryptionConfig.__new__(ParquetEncryptionConfig) + python_config.c_config = c_config + return python_config + + cdef shared_ptr[CParquetEncryptionConfig] unwrap(self): + return self.c_config + + +cdef class ParquetDecryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level decryption + within the Parquet framework. + + ParquetDecryptionConfig is designed to pass decryption-related parameters to + the appropriate decryption components within the Parquet library. It holds references to + objects that define the decryption strategy, Key Management Service (KMS) configuration, + and specific decryption configurations for reading encrypted Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic + components for the decryption process. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary + for connecting to a Key Management Service (KMS) during decryption. + decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration + Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings + for reading encrypted Parquet data. + + Raises + ------ + ValueError + Raised if `decryption_config` is None. 
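# Sketch of the matching read path; reuses the hypothetical crypto_factory
# and kms_connection objects from the encryption sketch above.
import pyarrow.dataset as ds
import pyarrow.parquet.encryption as pe

decryption_cfg = ds.ParquetDecryptionConfig(
    crypto_factory, kms_connection, pe.DecryptionConfiguration())
fmt = ds.ParquetFileFormat(
    default_fragment_scan_options=ds.ParquetFragmentScanOptions(
        decryption_config=decryption_cfg))
dataset = ds.dataset("encrypted_dataset/", format=fmt)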
+ """ + + cdef: + shared_ptr[CParquetDecryptionConfig] c_config + + # Avoid mistakingly creating attributes + __slots__ = () + + def __cinit__(self, CryptoFactory crypto_factory, KmsConnectionConfig kms_connection_config, + DecryptionConfiguration decryption_config): + + cdef shared_ptr[CDecryptionConfiguration] c_decryption_config + + if decryption_config is None: + raise ValueError( + "decryption_config cannot be None") + + self.c_config.reset(new CParquetDecryptionConfig()) + + c_decryption_config = pyarrow_unwrap_decryptionconfig( + decryption_config) + + self.c_config.get().crypto_factory = pyarrow_unwrap_cryptofactory(crypto_factory) + self.c_config.get().kms_connection_config = pyarrow_unwrap_kmsconnectionconfig( + kms_connection_config) + self.c_config.get().decryption_config = c_decryption_config + + @staticmethod + cdef wrap(shared_ptr[CParquetDecryptionConfig] c_config): + cdef ParquetDecryptionConfig python_config = ParquetDecryptionConfig.__new__(ParquetDecryptionConfig) + python_config.c_config = c_config + return python_config + + cdef shared_ptr[CParquetDecryptionConfig] unwrap(self): + return self.c_config + + +def set_encryption_config( + ParquetFileWriteOptions opts not None, + ParquetEncryptionConfig config not None +): + cdef shared_ptr[CParquetEncryptionConfig] c_config = config.unwrap() + opts.parquet_options.parquet_encryption_config = c_config + + +def set_decryption_properties( + ParquetFragmentScanOptions opts not None, + FileDecryptionProperties config not None +): + cdef CReaderProperties* reader_props = opts.reader_properties() + reader_props.file_decryption_properties(config.unwrap()) + + +def set_decryption_config( + ParquetFragmentScanOptions opts not None, + ParquetDecryptionConfig config not None +): + cdef shared_ptr[CParquetDecryptionConfig] c_config = config.unwrap() + opts.parquet_options.parquet_decryption_config = c_config diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dlpack.pxi b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dlpack.pxi new file mode 100644 index 0000000000000000000000000000000000000000..c2f4cff64069195ad70f2ea271a842dfd166058c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_dlpack.pxi @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +cimport cpython +from cpython.pycapsule cimport PyCapsule_New + + +cdef void dlpack_pycapsule_deleter(object dltensor) noexcept: + cdef DLManagedTensor* dlm_tensor + cdef PyObject* err_type + cdef PyObject* err_value + cdef PyObject* err_traceback + + # Do nothing if the capsule has been consumed + if cpython.PyCapsule_IsValid(dltensor, "used_dltensor"): + return + + # An exception may be in-flight, we must save it in case + # we create another one + cpython.PyErr_Fetch(&err_type, &err_value, &err_traceback) + + dlm_tensor = cpython.PyCapsule_GetPointer(dltensor, 'dltensor') + if dlm_tensor == NULL: + cpython.PyErr_WriteUnraisable(dltensor) + # The deleter can be NULL if there is no way for the caller + # to provide a reasonable destructor + elif dlm_tensor.deleter: + dlm_tensor.deleter(dlm_tensor) + assert (not cpython.PyErr_Occurred()) + + # Set the error indicator from err_type, err_value, err_traceback + cpython.PyErr_Restore(err_type, err_value, err_traceback) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_feather.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_feather.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..ba88cee466c06372ee0aeac74e06451f01ef10f8 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_feather.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_feather.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_feather.pyx new file mode 100644 index 0000000000000000000000000000000000000000..7dd61c9a986ff1044fb7b5c22a2f24725710afd7 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_feather.pyx @@ -0,0 +1,117 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
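# Sketch of what the DLPack capsule deleter above enables: zero-copy export
# of a primitive, null-free pyarrow array to another library. Assumes
# numpy >= 1.22 and pyarrow's __dlpack__ support for this array type.
import numpy as np
import pyarrow as pa

arr = pa.array([1, 2, 3], type=pa.int64())
np_view = np.from_dlpack(arr)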
+ +# --------------------------------------------------------------------- +# Implement Feather file format + +# cython: profile=False +# distutils: language = c++ +# cython: language_level=3 + +from cython.operator cimport dereference as deref +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_feather cimport * +from pyarrow.lib cimport (check_status, Table, _Weakrefable, + get_writer, get_reader, pyarrow_wrap_table) +from pyarrow.lib import tobytes + + +class FeatherError(Exception): + pass + + +def write_feather(Table table, object dest, compression=None, + compression_level=None, chunksize=None, version=2): + cdef shared_ptr[COutputStream] sink + get_writer(dest, &sink) + + cdef CFeatherProperties properties + if version == 2: + properties.version = kFeatherV2Version + else: + properties.version = kFeatherV1Version + + if compression == 'zstd': + properties.compression = CCompressionType_ZSTD + elif compression == 'lz4': + properties.compression = CCompressionType_LZ4_FRAME + else: + properties.compression = CCompressionType_UNCOMPRESSED + + if chunksize is not None: + properties.chunksize = chunksize + + if compression_level is not None: + properties.compression_level = compression_level + + with nogil: + check_status(WriteFeather(deref(table.table), sink.get(), + properties)) + + +cdef class FeatherReader(_Weakrefable): + cdef: + shared_ptr[CFeatherReader] reader + + def __cinit__(self, source, c_bool use_memory_map, c_bool use_threads): + cdef: + shared_ptr[CRandomAccessFile] reader + CIpcReadOptions options = CIpcReadOptions.Defaults() + options.use_threads = use_threads + + get_reader(source, use_memory_map, &reader) + with nogil: + self.reader = GetResultValue(CFeatherReader.Open(reader, options)) + + @property + def version(self): + return self.reader.get().version() + + def read(self): + cdef shared_ptr[CTable] sp_table + with nogil: + check_status(self.reader.get() + .Read(&sp_table)) + + return pyarrow_wrap_table(sp_table) + + def read_indices(self, indices): + cdef: + shared_ptr[CTable] sp_table + vector[int] c_indices + + for index in indices: + c_indices.push_back(index) + with nogil: + check_status(self.reader.get() + .Read(c_indices, &sp_table)) + + return pyarrow_wrap_table(sp_table) + + def read_names(self, names): + cdef: + shared_ptr[CTable] sp_table + vector[c_string] c_names + + for name in names: + c_names.push_back(tobytes(name)) + with nogil: + check_status(self.reader.get() + .Read(c_names, &sp_table)) + + return pyarrow_wrap_table(sp_table) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_flight.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_flight.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..d3cc267e7b033e5cf4f5e821508352d7be0073be --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_flight.cpython-312-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b1d36c09fd0e2ccad1f9f2aacc459d39c27c1d5e67272efeaf835b8ad8b868 +size 1295936 diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_flight.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_flight.pyx new file mode 100644 index 0000000000000000000000000000000000000000..8289215de2e29c6cd7e09affd7ec5d377ee0fa9c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_flight.pyx @@ -0,0 +1,3189 @@ +# Licensed to the Apache Software 
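# Sketch of the public pyarrow.feather API that wraps the writer/reader
# defined above; "example.feather" is an assumed path.
import pyarrow as pa
import pyarrow.feather as feather

table = pa.table({"a": [1, 2, 3]})
feather.write_feather(table, "example.feather")
roundtripped = feather.read_table("example.feather", columns=["a"])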
Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +import collections +import enum +import re +import time +import warnings +import weakref + +from cython.operator cimport dereference as deref +from cython.operator cimport postincrement +from libcpp cimport bool as c_bool + +from pyarrow.lib cimport * +from pyarrow.lib import (ArrowCancelled, ArrowException, ArrowInvalid, + SignalStopHandler) +from pyarrow.lib import as_buffer, frombytes, tobytes +from pyarrow.includes.libarrow_flight cimport * +from pyarrow.ipc import _get_legacy_format_default, _ReadPandasMixin +import pyarrow.lib as lib + + +cdef CFlightCallOptions DEFAULT_CALL_OPTIONS + + +cdef int check_flight_status(const CStatus& status) except -1 nogil: + cdef shared_ptr[FlightStatusDetail] detail + + if status.ok(): + return 0 + + detail = FlightStatusDetail.UnwrapStatus(status) + if detail: + with gil: + message = frombytes(status.message(), safe=True) + detail_msg = detail.get().extra_info() + if detail.get().code() == CFlightStatusInternal: + raise FlightInternalError(message, detail_msg) + elif detail.get().code() == CFlightStatusFailed: + message = _munge_grpc_python_error(message) + raise FlightServerError(message, detail_msg) + elif detail.get().code() == CFlightStatusTimedOut: + raise FlightTimedOutError(message, detail_msg) + elif detail.get().code() == CFlightStatusCancelled: + raise FlightCancelledError(message, detail_msg) + elif detail.get().code() == CFlightStatusUnauthenticated: + raise FlightUnauthenticatedError(message, detail_msg) + elif detail.get().code() == CFlightStatusUnauthorized: + raise FlightUnauthorizedError(message, detail_msg) + elif detail.get().code() == CFlightStatusUnavailable: + raise FlightUnavailableError(message, detail_msg) + + size_detail = FlightWriteSizeStatusDetail.UnwrapStatus(status) + if size_detail: + with gil: + message = frombytes(status.message(), safe=True) + raise FlightWriteSizeExceededError( + message, + size_detail.get().limit(), size_detail.get().actual()) + + return check_status(status) + + +_FLIGHT_SERVER_ERROR_REGEX = re.compile( + r'Flight RPC failed with message: (.*). Detail: ' + r'Python exception: (.*)', + re.DOTALL +) + + +def _munge_grpc_python_error(message): + m = _FLIGHT_SERVER_ERROR_REGEX.match(message) + if m: + return ('Flight RPC failed with Python exception \"{}: {}\"' + .format(m.group(2), m.group(1))) + else: + return message + + +cdef IpcWriteOptions _get_options(options): + return _get_legacy_format_default( + use_legacy_format=None, options=options) + + +cdef class FlightCallOptions(_Weakrefable): + """RPC-layer options for a Flight call.""" + + cdef: + CFlightCallOptions options + + def __init__(self, timeout=None, write_options=None, headers=None, + IpcReadOptions read_options=None): + """Create call options. 
+ + Parameters + ---------- + timeout : float, None + A timeout for the call, in seconds. None means that the + timeout defaults to an implementation-specific value. + write_options : pyarrow.ipc.IpcWriteOptions, optional + IPC write options. The default options can be controlled + by environment variables (see pyarrow.ipc). + headers : List[Tuple[str, str]], optional + A list of arbitrary headers as key, value tuples + read_options : pyarrow.ipc.IpcReadOptions, optional + Serialization options for reading IPC format. + """ + cdef IpcWriteOptions c_write_options + + if timeout is not None: + self.options.timeout = CTimeoutDuration(timeout) + if write_options is not None: + c_write_options = _get_options(write_options) + self.options.write_options = c_write_options.c_options + if read_options is not None: + if not isinstance(read_options, IpcReadOptions): + raise TypeError("expected IpcReadOptions, got {}" + .format(type(read_options))) + self.options.read_options = read_options.c_options + if headers is not None: + self.options.headers = headers + + @staticmethod + cdef CFlightCallOptions* unwrap(obj): + if not obj: + return &DEFAULT_CALL_OPTIONS + elif isinstance(obj, FlightCallOptions): + return &(( obj).options) + raise TypeError("Expected a FlightCallOptions object, not " + "'{}'".format(type(obj))) + + +_CertKeyPair = collections.namedtuple('_CertKeyPair', ['cert', 'key']) + + +class CertKeyPair(_CertKeyPair): + """A TLS certificate and key for use in Flight.""" + + +cdef class FlightError(Exception): + """ + The base class for Flight-specific errors. + + A server may raise this class or one of its subclasses to provide + a more detailed error to clients. + + Parameters + ---------- + message : str, optional + The error message. + extra_info : bytes, optional + Extra binary error details that were provided by the + server/will be sent to the client. + + Attributes + ---------- + extra_info : bytes + Extra binary error details that were provided by the + server/will be sent to the client. 
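# Sketch of per-call options on a Flight client; the server address and
# command are assumptions.
import pyarrow.flight as flight

client = flight.FlightClient("grpc://localhost:8815")
options = flight.FlightCallOptions(timeout=5.0, headers=[(b"x-trace", b"123")])
info = client.get_flight_info(
    flight.FlightDescriptor.for_command(b"select 1"), options)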
+ """ + + cdef dict __dict__ + + def __init__(self, message='', extra_info=b''): + super().__init__(message) + self.extra_info = tobytes(extra_info) + + cdef CStatus to_status(self): + message = tobytes("Flight error: {}".format(str(self))) + return CStatus_UnknownError(message) + + +cdef class FlightInternalError(FlightError, ArrowException): + """An error internal to the Flight server occurred.""" + + cdef CStatus to_status(self): + return MakeFlightError(CFlightStatusInternal, + tobytes(str(self)), self.extra_info) + + +cdef class FlightTimedOutError(FlightError, ArrowException): + """The Flight RPC call timed out.""" + + cdef CStatus to_status(self): + return MakeFlightError(CFlightStatusTimedOut, + tobytes(str(self)), self.extra_info) + + +cdef class FlightCancelledError(FlightError, ArrowCancelled): + """The operation was cancelled.""" + + cdef CStatus to_status(self): + return MakeFlightError(CFlightStatusCancelled, tobytes(str(self)), + self.extra_info) + + +cdef class FlightServerError(FlightError, ArrowException): + """A server error occurred.""" + + cdef CStatus to_status(self): + return MakeFlightError(CFlightStatusFailed, tobytes(str(self)), + self.extra_info) + + +cdef class FlightUnauthenticatedError(FlightError, ArrowException): + """The client is not authenticated.""" + + cdef CStatus to_status(self): + return MakeFlightError( + CFlightStatusUnauthenticated, tobytes(str(self)), self.extra_info) + + +cdef class FlightUnauthorizedError(FlightError, ArrowException): + """The client is not authorized to perform the given operation.""" + + cdef CStatus to_status(self): + return MakeFlightError(CFlightStatusUnauthorized, tobytes(str(self)), + self.extra_info) + + +cdef class FlightUnavailableError(FlightError, ArrowException): + """The server is not reachable or available.""" + + cdef CStatus to_status(self): + return MakeFlightError(CFlightStatusUnavailable, tobytes(str(self)), + self.extra_info) + + +class FlightWriteSizeExceededError(ArrowInvalid): + """A write operation exceeded the client-configured limit.""" + + def __init__(self, message, limit, actual): + super().__init__(message) + self.limit = limit + self.actual = actual + + +cdef class Action(_Weakrefable): + """An action executable on a Flight service.""" + cdef: + CAction action + + def __init__(self, action_type, buf): + """Create an action from a type and a buffer. + + Parameters + ---------- + action_type : bytes or str + buf : Buffer or bytes-like object + """ + self.action.type = tobytes(action_type) + self.action.body = pyarrow_unwrap_buffer(as_buffer(buf)) + + @property + def type(self): + """The action type.""" + return frombytes(self.action.type) + + @property + def body(self): + """The action body (arguments for the action).""" + return pyarrow_wrap_buffer(self.action.body) + + @staticmethod + cdef CAction unwrap(action) except *: + if not isinstance(action, Action): + raise TypeError("Must provide Action, not '{}'".format( + type(action))) + return ( action).action + + def serialize(self): + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + return GetResultValue(self.action.SerializeToString()) + + @classmethod + def deserialize(cls, serialized): + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. 
+ + """ + cdef Action action = Action.__new__(Action) + action.action = GetResultValue( + CAction.Deserialize(tobytes(serialized))) + return action + + def __eq__(self, Action other): + return self.action == other.action + + def __repr__(self): + return (f"") + + +_ActionType = collections.namedtuple('_ActionType', ['type', 'description']) + + +class ActionType(_ActionType): + """A type of action that is executable on a Flight service.""" + + def make_action(self, buf): + """Create an Action with this type. + + Parameters + ---------- + buf : obj + An Arrow buffer or Python bytes or bytes-like object. + """ + return Action(self.type, buf) + + +cdef class Result(_Weakrefable): + """A result from executing an Action.""" + cdef: + unique_ptr[CFlightResult] result + + def __init__(self, buf): + """Create a new result. + + Parameters + ---------- + buf : Buffer or bytes-like object + """ + self.result.reset(new CFlightResult()) + self.result.get().body = pyarrow_unwrap_buffer(as_buffer(buf)) + + @property + def body(self): + """Get the Buffer containing the result.""" + return pyarrow_wrap_buffer(self.result.get().body) + + def serialize(self): + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + return GetResultValue(self.result.get().SerializeToString()) + + @classmethod + def deserialize(cls, serialized): + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + cdef Result result = Result.__new__(Result) + result.result.reset(new CFlightResult(GetResultValue( + CFlightResult.Deserialize(tobytes(serialized))))) + return result + + def __eq__(self, Result other): + return deref(self.result.get()) == deref(other.result.get()) + + def __repr__(self): + return f"" + + +cdef class BasicAuth(_Weakrefable): + """A container for basic auth.""" + cdef: + unique_ptr[CBasicAuth] basic_auth + + def __init__(self, username=None, password=None): + """Create a new basic auth object. + + Parameters + ---------- + username : string + password : string + """ + self.basic_auth.reset(new CBasicAuth()) + if username: + self.basic_auth.get().username = tobytes(username) + if password: + self.basic_auth.get().password = tobytes(password) + + @property + def username(self): + """Get the username.""" + return self.basic_auth.get().username + + @property + def password(self): + """Get the password.""" + return self.basic_auth.get().password + + @staticmethod + def deserialize(serialized): + auth = BasicAuth() + auth.basic_auth.reset(new CBasicAuth(GetResultValue( + CBasicAuth.Deserialize(tobytes(serialized))))) + return auth + + def serialize(self): + return GetResultValue(self.basic_auth.get().SerializeToString()) + + def __eq__(self, BasicAuth other): + return deref(self.basic_auth.get()) == deref(other.basic_auth.get()) + + def __repr__(self): + return (f"") + + +class DescriptorType(enum.Enum): + """ + The type of a FlightDescriptor. + + Attributes + ---------- + + UNKNOWN + An unknown descriptor type. + + PATH + A Flight stream represented by a path. + + CMD + A Flight stream represented by an application-defined command. 
+ + """ + + UNKNOWN = 0 + PATH = 1 + CMD = 2 + + +class FlightMethod(enum.Enum): + """The implemented methods in Flight.""" + + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + + +cdef wrap_flight_method(CFlightMethod method): + if method == CFlightMethodHandshake: + return FlightMethod.HANDSHAKE + elif method == CFlightMethodListFlights: + return FlightMethod.LIST_FLIGHTS + elif method == CFlightMethodGetFlightInfo: + return FlightMethod.GET_FLIGHT_INFO + elif method == CFlightMethodGetSchema: + return FlightMethod.GET_SCHEMA + elif method == CFlightMethodDoGet: + return FlightMethod.DO_GET + elif method == CFlightMethodDoPut: + return FlightMethod.DO_PUT + elif method == CFlightMethodDoAction: + return FlightMethod.DO_ACTION + elif method == CFlightMethodListActions: + return FlightMethod.LIST_ACTIONS + elif method == CFlightMethodDoExchange: + return FlightMethod.DO_EXCHANGE + return FlightMethod.INVALID + + +cdef class FlightDescriptor(_Weakrefable): + """A description of a data stream available from a Flight service.""" + cdef: + CFlightDescriptor descriptor + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly, use " + "`pyarrow.flight.FlightDescriptor.for_{path,command}` " + "function instead." + .format(self.__class__.__name__)) + + @staticmethod + def for_path(*path): + """Create a FlightDescriptor for a resource path.""" + cdef FlightDescriptor result = \ + FlightDescriptor.__new__(FlightDescriptor) + result.descriptor.type = CDescriptorTypePath + result.descriptor.path = [tobytes(p) for p in path] + return result + + @staticmethod + def for_command(command): + """Create a FlightDescriptor for an opaque command.""" + cdef FlightDescriptor result = \ + FlightDescriptor.__new__(FlightDescriptor) + result.descriptor.type = CDescriptorTypeCmd + result.descriptor.cmd = tobytes(command) + return result + + @property + def descriptor_type(self): + """Get the type of this descriptor.""" + if self.descriptor.type == CDescriptorTypeUnknown: + return DescriptorType.UNKNOWN + elif self.descriptor.type == CDescriptorTypePath: + return DescriptorType.PATH + elif self.descriptor.type == CDescriptorTypeCmd: + return DescriptorType.CMD + raise RuntimeError("Invalid descriptor type!") + + @property + def command(self): + """Get the command for this descriptor.""" + if self.descriptor_type != DescriptorType.CMD: + return None + return self.descriptor.cmd + + @property + def path(self): + """Get the path for this descriptor.""" + if self.descriptor_type != DescriptorType.PATH: + return None + return self.descriptor.path + + def __repr__(self): + if self.descriptor_type == DescriptorType.PATH: + return f"" + elif self.descriptor_type == DescriptorType.CMD: + return f"" + else: + return "" + + @staticmethod + cdef CFlightDescriptor unwrap(descriptor) except *: + if not isinstance(descriptor, FlightDescriptor): + raise TypeError("Must provide a FlightDescriptor, not '{}'".format( + type(descriptor))) + return ( descriptor).descriptor + + def serialize(self): + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + return GetResultValue(self.descriptor.SerializeToString()) + + @classmethod + def deserialize(cls, serialized): + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. 
REST + services) that may want to return Flight types. + + """ + cdef FlightDescriptor descriptor = \ + FlightDescriptor.__new__(FlightDescriptor) + descriptor.descriptor = GetResultValue( + CFlightDescriptor.Deserialize(tobytes(serialized))) + return descriptor + + def __eq__(self, FlightDescriptor other): + return self.descriptor == other.descriptor + + +cdef class Ticket(_Weakrefable): + """A ticket for requesting a Flight stream.""" + + cdef: + CTicket c_ticket + + def __init__(self, ticket): + self.c_ticket.ticket = tobytes(ticket) + + @property + def ticket(self): + return self.c_ticket.ticket + + def serialize(self): + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + return GetResultValue(self.c_ticket.SerializeToString()) + + @classmethod + def deserialize(cls, serialized): + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + cdef Ticket ticket = Ticket.__new__(Ticket) + ticket.c_ticket = GetResultValue( + CTicket.Deserialize(tobytes(serialized))) + return ticket + + def __eq__(self, Ticket other): + return self.c_ticket == other.c_ticket + + def __repr__(self): + return f"" + + +cdef class Location(_Weakrefable): + """The location of a Flight service.""" + cdef: + CLocation location + + def __init__(self, uri): + check_flight_status(CLocation.Parse(tobytes(uri)).Value(&self.location)) + + def __repr__(self): + return f'' + + @property + def uri(self): + return self.location.ToString() + + def equals(self, Location other): + return self == other + + def __eq__(self, other): + if not isinstance(other, Location): + return NotImplemented + return self.location.Equals(( other).location) + + @staticmethod + def for_grpc_tcp(host, port): + """Create a Location for a TCP-based gRPC service.""" + cdef: + c_string c_host = tobytes(host) + int c_port = port + Location result = Location.__new__(Location) + check_flight_status( + CLocation.ForGrpcTcp(c_host, c_port).Value(&result.location)) + return result + + @staticmethod + def for_grpc_tls(host, port): + """Create a Location for a TLS-based gRPC service.""" + cdef: + c_string c_host = tobytes(host) + int c_port = port + Location result = Location.__new__(Location) + check_flight_status( + CLocation.ForGrpcTls(c_host, c_port).Value(&result.location)) + return result + + @staticmethod + def for_grpc_unix(path): + """Create a Location for a domain socket-based gRPC service.""" + cdef: + c_string c_path = tobytes(path) + Location result = Location.__new__(Location) + check_flight_status(CLocation.ForGrpcUnix(c_path).Value(&result.location)) + return result + + @staticmethod + cdef Location wrap(CLocation location): + cdef Location result = Location.__new__(Location) + result.location = location + return result + + @staticmethod + cdef CLocation unwrap(object location) except *: + cdef CLocation c_location + if isinstance(location, str): + check_flight_status( + CLocation.Parse(tobytes(location)).Value(&c_location)) + return c_location + elif not isinstance(location, Location): + raise TypeError("Must provide a Location, not '{}'".format( + type(location))) + return ( location).location + + +cdef class FlightEndpoint(_Weakrefable): + """A Flight stream, along with the ticket and locations to access it.""" + cdef: + CFlightEndpoint endpoint + + def __init__(self, ticket, locations): + 
"""Create a FlightEndpoint from a ticket and list of locations. + + Parameters + ---------- + ticket : Ticket or bytes + the ticket needed to access this flight + locations : list of string URIs + locations where this flight is available + + Raises + ------ + ArrowException + If one of the location URIs is not a valid URI. + """ + cdef: + CLocation c_location + + if isinstance(ticket, Ticket): + self.endpoint.ticket.ticket = tobytes(ticket.ticket) + else: + self.endpoint.ticket.ticket = tobytes(ticket) + + for location in locations: + if isinstance(location, Location): + c_location = ( location).location + else: + c_location = CLocation() + check_flight_status( + CLocation.Parse(tobytes(location)).Value(&c_location)) + self.endpoint.locations.push_back(c_location) + + @property + def ticket(self): + """Get the ticket in this endpoint.""" + return Ticket(self.endpoint.ticket.ticket) + + @property + def locations(self): + return [Location.wrap(location) + for location in self.endpoint.locations] + + def serialize(self): + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + return GetResultValue(self.endpoint.SerializeToString()) + + @classmethod + def deserialize(cls, serialized): + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + cdef FlightEndpoint endpoint = FlightEndpoint.__new__(FlightEndpoint) + endpoint.endpoint = GetResultValue( + CFlightEndpoint.Deserialize(tobytes(serialized))) + return endpoint + + def __repr__(self): + return (f"") + + def __eq__(self, FlightEndpoint other): + return self.endpoint == other.endpoint + + +cdef class SchemaResult(_Weakrefable): + """The serialized schema returned from a GetSchema request.""" + cdef: + unique_ptr[CSchemaResult] result + + def __init__(self, Schema schema): + """Create a SchemaResult from a schema. + + Parameters + ---------- + schema: Schema + the schema of the data in this flight. + """ + cdef: + shared_ptr[CSchema] c_schema = pyarrow_unwrap_schema(schema) + check_flight_status(CreateSchemaResult(c_schema, &self.result)) + + @property + def schema(self): + """The schema of the data in this flight.""" + cdef: + shared_ptr[CSchema] schema + CDictionaryMemo dummy_memo + + check_flight_status(self.result.get().GetSchema(&dummy_memo).Value(&schema)) + return pyarrow_wrap_schema(schema) + + def serialize(self): + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + return GetResultValue(self.result.get().SerializeToString()) + + @classmethod + def deserialize(cls, serialized): + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. 
+ + """ + cdef SchemaResult result = SchemaResult.__new__(SchemaResult) + result.result.reset(new CSchemaResult(GetResultValue( + CSchemaResult.Deserialize(tobytes(serialized))))) + return result + + def __eq__(self, SchemaResult other): + return deref(self.result.get()) == deref(other.result.get()) + + def __repr__(self): + return f"" + + +cdef class FlightInfo(_Weakrefable): + """A description of a Flight stream.""" + cdef: + unique_ptr[CFlightInfo] info + + @staticmethod + cdef wrap(CFlightInfo c_info): + cdef FlightInfo obj = FlightInfo.__new__(FlightInfo) + obj.info.reset(new CFlightInfo(move(c_info))) + return obj + + def __init__(self, Schema schema, FlightDescriptor descriptor, endpoints, + total_records, total_bytes): + """Create a FlightInfo object from a schema, descriptor, and endpoints. + + Parameters + ---------- + schema : Schema + the schema of the data in this flight. + descriptor : FlightDescriptor + the descriptor for this flight. + endpoints : list of FlightEndpoint + a list of endpoints where this flight is available. + total_records : int + the total records in this flight, or -1 if unknown + total_bytes : int + the total bytes in this flight, or -1 if unknown + """ + cdef: + shared_ptr[CSchema] c_schema = pyarrow_unwrap_schema(schema) + vector[CFlightEndpoint] c_endpoints + + for endpoint in endpoints: + if isinstance(endpoint, FlightEndpoint): + c_endpoints.push_back(( endpoint).endpoint) + else: + raise TypeError('Endpoint {} is not instance of' + ' FlightEndpoint'.format(endpoint)) + + check_flight_status(CreateFlightInfo(c_schema, + descriptor.descriptor, + c_endpoints, + total_records, + total_bytes, &self.info)) + + @property + def total_records(self): + """The total record count of this flight, or -1 if unknown.""" + return self.info.get().total_records() + + @property + def total_bytes(self): + """The size in bytes of the data in this flight, or -1 if unknown.""" + return self.info.get().total_bytes() + + @property + def schema(self): + """The schema of the data in this flight.""" + cdef: + shared_ptr[CSchema] schema + CDictionaryMemo dummy_memo + + check_flight_status(self.info.get().GetSchema(&dummy_memo).Value(&schema)) + return pyarrow_wrap_schema(schema) + + @property + def descriptor(self): + """The descriptor of the data in this flight.""" + cdef FlightDescriptor result = \ + FlightDescriptor.__new__(FlightDescriptor) + result.descriptor = self.info.get().descriptor() + return result + + @property + def endpoints(self): + """The endpoints where this flight is available.""" + # TODO: get Cython to iterate over reference directly + cdef: + vector[CFlightEndpoint] endpoints = self.info.get().endpoints() + FlightEndpoint py_endpoint + + result = [] + for endpoint in endpoints: + py_endpoint = FlightEndpoint.__new__(FlightEndpoint) + py_endpoint.endpoint = endpoint + result.append(py_endpoint) + return result + + def serialize(self): + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + return GetResultValue(self.info.get().SerializeToString()) + + @classmethod + def deserialize(cls, serialized): + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. 
+ + """ + cdef FlightInfo info = FlightInfo.__new__(FlightInfo) + info.info = move(GetResultValue( + CFlightInfo.Deserialize(tobytes(serialized)))) + return info + + def __eq__(self, FlightInfo other): + return deref(self.info.get()) == deref(other.info.get()) + + def __repr__(self): + return (f"") + + +cdef class FlightStreamChunk(_Weakrefable): + """A RecordBatch with application metadata on the side.""" + cdef: + CFlightStreamChunk chunk + + @property + def data(self): + if self.chunk.data == NULL: + return None + return pyarrow_wrap_batch(self.chunk.data) + + @property + def app_metadata(self): + if self.chunk.app_metadata == NULL: + return None + return pyarrow_wrap_buffer(self.chunk.app_metadata) + + def __iter__(self): + return iter((self.data, self.app_metadata)) + + def __repr__(self): + return "".format( + self.chunk.data != NULL, self.chunk.app_metadata != NULL) + + +cdef class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + """A reader for Flight streams.""" + + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + + cdef dict __dict__ + cdef shared_ptr[CMetadataRecordBatchReader] reader + + def __iter__(self): + return self + + def __next__(self): + return self.read_chunk() + + @property + def schema(self): + """Get the schema for this reader.""" + cdef shared_ptr[CSchema] c_schema + with nogil: + check_flight_status(self.reader.get().GetSchema().Value(&c_schema)) + return pyarrow_wrap_schema(c_schema) + + def read_all(self): + """Read the entire contents of the stream as a Table.""" + cdef: + shared_ptr[CTable] c_table + with nogil: + check_flight_status(self.reader.get().ToTable().Value(&c_table)) + return pyarrow_wrap_table(c_table) + + def read_chunk(self): + """Read the next FlightStreamChunk along with any metadata. + + Returns + ------- + chunk : FlightStreamChunk + The next FlightStreamChunk in the stream. + + Raises + ------ + StopIteration + when the stream is finished + """ + cdef: + FlightStreamChunk chunk = FlightStreamChunk() + + with nogil: + check_flight_status(self.reader.get().Next().Value(&chunk.chunk)) + + if chunk.chunk.data == NULL and chunk.chunk.app_metadata == NULL: + raise StopIteration + + return chunk + + def to_reader(self): + """Convert this reader into a regular RecordBatchReader. + + This may fail if the schema cannot be read from the remote end. + + Returns + ------- + RecordBatchReader + """ + cdef RecordBatchReader reader + reader = RecordBatchReader.__new__(RecordBatchReader) + with nogil: + reader.reader = GetResultValue(MakeRecordBatchReader(self.reader)) + + return reader + + +cdef class MetadataRecordBatchReader(_MetadataRecordBatchReader): + """The base class for readers for Flight streams. + + See Also + -------- + FlightStreamReader + """ + + +cdef class FlightStreamReader(MetadataRecordBatchReader): + """A reader that can also be canceled.""" + + def cancel(self): + """Cancel the read operation.""" + with nogil: + ( self.reader.get()).Cancel() + + def read_all(self): + """Read the entire contents of the stream as a Table.""" + cdef: + shared_ptr[CTable] c_table + CStopToken stop_token + with SignalStopHandler() as stop_handler: + stop_token = ( stop_handler.stop_token).stop_token + with nogil: + check_flight_status( + ( self.reader.get()) + .ToTableWithStopToken(stop_token).Value(&c_table)) + return pyarrow_wrap_table(c_table) + + +cdef class MetadataRecordBatchWriter(_CRecordBatchWriter): + """A RecordBatchWriter that also allows writing application metadata. 
+ + This class is a context manager; on exit, close() will be called. + """ + + cdef CMetadataRecordBatchWriter* _writer(self) nogil: + return self.writer.get() + + def begin(self, schema: Schema, options=None): + """Prepare to write data to this stream with the given schema.""" + cdef: + shared_ptr[CSchema] c_schema = pyarrow_unwrap_schema(schema) + CIpcWriteOptions c_options = _get_options(options).c_options + with nogil: + check_flight_status(self._writer().Begin(c_schema, c_options)) + + def write_metadata(self, buf): + """Write Flight metadata by itself.""" + cdef shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(as_buffer(buf)) + with nogil: + check_flight_status( + self._writer().WriteMetadata(c_buf)) + + def write_batch(self, RecordBatch batch): + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + """ + cdef: + shared_ptr[const CKeyValueMetadata] custom_metadata + + # Override superclass method to use check_flight_status so we + # can generate FlightWriteSizeExceededError. We don't do this + # for write_table as callers who intend to handle the error + # and retry with a smaller batch should be working with + # individual batches to have control. + + with nogil: + check_flight_status( + self._writer().WriteRecordBatch(deref(batch.batch), custom_metadata)) + + def write_table(self, Table table, max_chunksize=None, **kwargs): + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + cdef: + # max_chunksize must be > 0 to have any impact + int64_t c_max_chunksize = -1 + + if 'chunksize' in kwargs: + max_chunksize = kwargs['chunksize'] + msg = ('The parameter chunksize is deprecated for the write_table ' + 'methods as of 0.15, please use parameter ' + 'max_chunksize instead') + warnings.warn(msg, FutureWarning) + + if max_chunksize is not None: + c_max_chunksize = max_chunksize + + with nogil: + check_flight_status( + self._writer().WriteTable(table.table[0], c_max_chunksize)) + + def close(self): + """ + Close stream and write end-of-stream 0 marker. + """ + with nogil: + check_flight_status(self._writer().Close()) + + def write_with_metadata(self, RecordBatch batch, buf): + """Write a RecordBatch along with Flight metadata. + + Parameters + ---------- + batch : RecordBatch + The next RecordBatch in the stream. + buf : Buffer + Application-specific metadata for the batch as defined by + Flight. 
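+
+        A minimal usage sketch (the metadata payload shown is only an
+        illustrative placeholder):
+
+            writer.write_with_metadata(batch, b"chunk-0")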
+ """ + cdef shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(as_buffer(buf)) + with nogil: + check_flight_status( + self._writer().WriteWithMetadata(deref(batch.batch), c_buf)) + + +cdef class FlightStreamWriter(MetadataRecordBatchWriter): + """A writer that also allows closing the write side of a stream.""" + + def done_writing(self): + """Indicate that the client is done writing, but not done reading.""" + with nogil: + check_flight_status( + ( self.writer.get()).DoneWriting()) + + +cdef class FlightMetadataReader(_Weakrefable): + """A reader for Flight metadata messages sent during a DoPut.""" + + cdef: + unique_ptr[CFlightMetadataReader] reader + + def read(self): + """Read the next metadata message.""" + cdef shared_ptr[CBuffer] buf + with nogil: + check_flight_status(self.reader.get().ReadMetadata(&buf)) + if buf == NULL: + return None + return pyarrow_wrap_buffer(buf) + + +cdef class FlightMetadataWriter(_Weakrefable): + """A sender for Flight metadata messages during a DoPut.""" + + cdef: + unique_ptr[CFlightMetadataWriter] writer + + def write(self, message): + """Write the next metadata message. + + Parameters + ---------- + message : Buffer + """ + cdef shared_ptr[CBuffer] buf = \ + pyarrow_unwrap_buffer(as_buffer(message)) + with nogil: + check_flight_status(self.writer.get().WriteMetadata(deref(buf))) + + +class AsyncioCall: + """State for an async RPC using asyncio.""" + + def __init__(self) -> None: + import asyncio + self._future = asyncio.get_running_loop().create_future() + + def as_awaitable(self) -> object: + return self._future + + def wakeup(self, result_or_exception) -> None: + # Mark the Future done from within its loop (asyncio + # objects are generally not thread-safe) + loop = self._future.get_loop() + if isinstance(result_or_exception, BaseException): + loop.call_soon_threadsafe( + self._future.set_exception, result_or_exception) + else: + loop.call_soon_threadsafe( + self._future.set_result, result_or_exception) + + +cdef class AsyncioFlightClient: + """ + A FlightClient with an asyncio-based async interface. + + This interface is EXPERIMENTAL. + """ + + cdef: + FlightClient _client + + def __init__(self, FlightClient client) -> None: + self._client = client + + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions = None, + ): + call = AsyncioCall() + self._get_flight_info(call, descriptor, options) + return await call.as_awaitable() + + cdef _get_flight_info(self, call, descriptor, options): + cdef: + CFlightCallOptions* c_options = \ + FlightCallOptions.unwrap(options) + CFlightDescriptor c_descriptor = \ + FlightDescriptor.unwrap(descriptor) + CFuture[CFlightInfo] c_future + + with nogil: + c_future = self._client.client.get().GetFlightInfoAsync( + deref(c_options), c_descriptor) + + BindFuture(move(c_future), call.wakeup, FlightInfo.wrap) + + +cdef class FlightClient(_Weakrefable): + """A client to a Flight service. + + Connect to a Flight service on the given host and port. + + Parameters + ---------- + location : str, tuple or Location + Location to connect to. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + tls_root_certs : bytes or None + PEM-encoded + cert_chain: bytes or None + Client certificate if using mutual TLS + private_key: bytes or None + Client private key for cert_chain is using mutual TLS + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. 
+ middleware : list optional, default None + A list of ClientMiddlewareFactory instances. + write_size_limit_bytes : int optional, default None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean optional, default False + A flag that indicates that, if the client is connecting + with TLS, that it skips server verification. If this is + enabled, all other TLS settings are overridden. + generic_options : list optional, default None + A list of generic (string, int or string) option tuples passed + to the underlying transport. Effect is implementation + dependent. + """ + cdef: + unique_ptr[CFlightClient] client + + def __init__(self, location, *, tls_root_certs=None, cert_chain=None, + private_key=None, override_hostname=None, middleware=None, + write_size_limit_bytes=None, + disable_server_verification=None, generic_options=None): + if isinstance(location, (bytes, str)): + location = Location(location) + elif isinstance(location, tuple): + host, port = location + if tls_root_certs or disable_server_verification is not None: + location = Location.for_grpc_tls(host, port) + else: + location = Location.for_grpc_tcp(host, port) + elif not isinstance(location, Location): + raise TypeError('`location` argument must be a string, tuple or a ' + 'Location instance') + self.init(location, tls_root_certs, cert_chain, private_key, + override_hostname, middleware, write_size_limit_bytes, + disable_server_verification, generic_options) + + cdef init(self, Location location, tls_root_certs, cert_chain, + private_key, override_hostname, middleware, + write_size_limit_bytes, disable_server_verification, + generic_options): + cdef: + CLocation c_location = Location.unwrap(location) + CFlightClientOptions c_options = CFlightClientOptions.Defaults() + function[cb_client_middleware_start_call] start_call = \ + &_client_middleware_start_call + CIntStringVariant variant + + if tls_root_certs: + c_options.tls_root_certs = tobytes(tls_root_certs) + if cert_chain: + c_options.cert_chain = tobytes(cert_chain) + if private_key: + c_options.private_key = tobytes(private_key) + if override_hostname: + c_options.override_hostname = tobytes(override_hostname) + if disable_server_verification is not None: + c_options.disable_server_verification = disable_server_verification + if middleware: + for factory in middleware: + c_options.middleware.push_back( + + make_shared[CPyClientMiddlewareFactory]( + factory, start_call)) + if write_size_limit_bytes is not None: + c_options.write_size_limit_bytes = write_size_limit_bytes + else: + c_options.write_size_limit_bytes = 0 + if generic_options: + for key, value in generic_options: + if isinstance(value, (str, bytes)): + variant = CIntStringVariant( tobytes(value)) + else: + variant = CIntStringVariant( value) + c_options.generic_options.push_back( + pair[c_string, CIntStringVariant](tobytes(key), variant)) + + with nogil: + check_flight_status(CFlightClient.Connect(c_location, c_options + ).Value(&self.client)) + + @property + def supports_async(self): + return self.client.get().supports_async() + + def as_async(self) -> None: + check_status(self.client.get().CheckAsyncSupport()) + return AsyncioFlightClient(self) + + def wait_for_available(self, timeout=5): + """Block until the server can be contacted. 
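+
+        Internally this polls ``list_flights`` until a call succeeds (or
+        the server reports it as unimplemented). A minimal usage sketch,
+        with a placeholder URI:
+
+            client = FlightClient("grpc://localhost:8815")
+            client.wait_for_available(timeout=10)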
+ + Parameters + ---------- + timeout : int, default 5 + The maximum seconds to wait. + """ + deadline = time.time() + timeout + while True: + try: + list(self.list_flights()) + except FlightUnavailableError: + if time.time() < deadline: + time.sleep(0.025) + continue + else: + raise + except NotImplementedError: + # allow if list_flights is not implemented, because + # the server can be contacted nonetheless + break + else: + break + + @classmethod + def connect(cls, location, tls_root_certs=None, cert_chain=None, + private_key=None, override_hostname=None, + disable_server_verification=None): + """Connect to a Flight server. + + .. deprecated:: 0.15.0 + Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. + """ + warnings.warn("The 'FlightClient.connect' method is deprecated, use " + "FlightClient constructor or pyarrow.flight.connect " + "function instead") + return FlightClient( + location, tls_root_certs=tls_root_certs, + cert_chain=cert_chain, private_key=private_key, + override_hostname=override_hostname, + disable_server_verification=disable_server_verification + ) + + def authenticate(self, auth_handler, options: FlightCallOptions = None): + """Authenticate to the server. + + Parameters + ---------- + auth_handler : ClientAuthHandler + The authentication mechanism to use. + options : FlightCallOptions + Options for this call. + """ + cdef: + unique_ptr[CClientAuthHandler] handler + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + + if not isinstance(auth_handler, ClientAuthHandler): + raise TypeError( + "FlightClient.authenticate takes a ClientAuthHandler, " + "not '{}'".format(type(auth_handler))) + handler.reset(( auth_handler).to_handler()) + with nogil: + check_flight_status( + self.client.get().Authenticate(deref(c_options), + move(handler))) + + def authenticate_basic_token(self, username, password, + options: FlightCallOptions = None): + """Authenticate to the server with HTTP basic authentication. + + Parameters + ---------- + username : string + Username to authenticate with + password : string + Password to authenticate with + options : FlightCallOptions + Options for this call + + Returns + ------- + tuple : Tuple[str, str] + A tuple representing the FlightCallOptions authorization + header entry of a bearer token. + """ + cdef: + CResult[pair[c_string, c_string]] result + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + c_string user = tobytes(username) + c_string pw = tobytes(password) + + with nogil: + result = self.client.get().AuthenticateBasicToken(deref(c_options), + user, pw) + check_flight_status(result.status()) + + return GetResultValue(result) + + def list_actions(self, options: FlightCallOptions = None): + """List the actions available on a service.""" + cdef: + vector[CActionType] results + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + + with SignalStopHandler() as stop_handler: + c_options.stop_token = \ + ( stop_handler.stop_token).stop_token + with nogil: + check_flight_status( + self.client.get().ListActions(deref(c_options)).Value(&results)) + + result = [] + for action_type in results: + py_action = ActionType(frombytes(action_type.type), + frombytes(action_type.description)) + result.append(py_action) + + return result + + def do_action(self, action, options: FlightCallOptions = None): + """ + Execute an action on a service. 
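+
+        A minimal usage sketch; the action name is an illustrative
+        placeholder, and it assumes ``Result`` exposes its payload as a
+        ``body`` buffer:
+
+            for result in client.do_action(("healthcheck", b"")):
+                print(result.body.to_pybytes())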
+ + Parameters + ---------- + action : str, tuple, or Action + Can be action type name (no body), type and body, or any Action + object + options : FlightCallOptions + RPC options + + Returns + ------- + results : iterator of Result values + """ + cdef: + unique_ptr[CResultStream] results + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + + if isinstance(action, (str, bytes)): + action = Action(action, b'') + elif isinstance(action, tuple): + action = Action(*action) + elif not isinstance(action, Action): + raise TypeError("Action must be Action instance, string, or tuple") + + cdef CAction c_action = Action.unwrap( action) + with nogil: + check_flight_status( + self.client.get().DoAction( + deref(c_options), c_action).Value(&results)) + + def _do_action_response(): + cdef: + Result result + while True: + result = Result.__new__(Result) + with nogil: + check_flight_status(results.get().Next().Value(&result.result)) + if result.result == NULL: + break + yield result + return _do_action_response() + + def list_flights(self, criteria: bytes = None, + options: FlightCallOptions = None): + """List the flights available on a service.""" + cdef: + unique_ptr[CFlightListing] listing + FlightInfo result + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + CCriteria c_criteria + + if criteria: + c_criteria.expression = tobytes(criteria) + + with SignalStopHandler() as stop_handler: + c_options.stop_token = \ + ( stop_handler.stop_token).stop_token + with nogil: + check_flight_status( + self.client.get().ListFlights(deref(c_options), + c_criteria).Value(&listing)) + + while True: + result = FlightInfo.__new__(FlightInfo) + with nogil: + check_flight_status(listing.get().Next().Value(&result.info)) + if result.info == NULL: + break + yield result + + def get_flight_info(self, descriptor: FlightDescriptor, + options: FlightCallOptions = None): + """Request information about an available flight.""" + cdef: + FlightInfo result = FlightInfo.__new__(FlightInfo) + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + CFlightDescriptor c_descriptor = \ + FlightDescriptor.unwrap(descriptor) + + with nogil: + check_flight_status(self.client.get().GetFlightInfo( + deref(c_options), c_descriptor).Value(&result.info)) + + return result + + def get_schema(self, descriptor: FlightDescriptor, + options: FlightCallOptions = None): + """Request schema for an available flight.""" + cdef: + SchemaResult result = SchemaResult.__new__(SchemaResult) + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + CFlightDescriptor c_descriptor = \ + FlightDescriptor.unwrap(descriptor) + with nogil: + check_status( + self.client.get() + .GetSchema(deref(c_options), c_descriptor).Value(&result.result) + ) + + return result + + def do_get(self, ticket: Ticket, options: FlightCallOptions = None): + """Request the data for a flight. + + Returns + ------- + reader : FlightStreamReader + """ + cdef: + unique_ptr[CFlightStreamReader] reader + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + + with nogil: + check_flight_status( + self.client.get().DoGet( + deref(c_options), ticket.c_ticket).Value(&reader)) + result = FlightStreamReader() + result.reader.reset(reader.release()) + return result + + def do_put(self, descriptor: FlightDescriptor, Schema schema not None, + options: FlightCallOptions = None): + """Upload data to a flight. 
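+
+        A minimal usage sketch (the descriptor path and ``table`` are
+        illustrative placeholders; ``FlightDescriptor.for_path`` is assumed
+        to be available as the path-based constructor):
+
+            descriptor = FlightDescriptor.for_path("example")
+            writer, metadata_reader = client.do_put(descriptor, table.schema)
+            writer.write_table(table)
+            writer.close()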
+ + Returns + ------- + writer : FlightStreamWriter + reader : FlightMetadataReader + """ + cdef: + shared_ptr[CSchema] c_schema = pyarrow_unwrap_schema(schema) + CDoPutResult c_do_put_result + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + CFlightDescriptor c_descriptor = \ + FlightDescriptor.unwrap(descriptor) + + with nogil: + check_flight_status(self.client.get().DoPut( + deref(c_options), + c_descriptor, + c_schema).Value(&c_do_put_result)) + py_writer = FlightStreamWriter() + py_writer.writer.reset(c_do_put_result.writer.release()) + py_reader = FlightMetadataReader() + py_reader.reader.reset(c_do_put_result.reader.release()) + return py_writer, py_reader + + def do_exchange(self, descriptor: FlightDescriptor, + options: FlightCallOptions = None): + """Start a bidirectional data exchange with a server. + + Parameters + ---------- + descriptor : FlightDescriptor + A descriptor for the flight. + options : FlightCallOptions + RPC options. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightStreamReader + """ + cdef: + CDoExchangeResult c_do_exchange_result + CFlightCallOptions* c_options = FlightCallOptions.unwrap(options) + CFlightDescriptor c_descriptor = \ + FlightDescriptor.unwrap(descriptor) + + with nogil: + check_flight_status(self.client.get().DoExchange( + deref(c_options), + c_descriptor).Value(&c_do_exchange_result)) + py_writer = FlightStreamWriter() + py_writer.writer.reset(c_do_exchange_result.writer.release()) + py_reader = FlightStreamReader() + py_reader.reader.reset(c_do_exchange_result.reader.release()) + return py_writer, py_reader + + def close(self): + """Close the client and disconnect.""" + client = self.client.get() + if client != NULL: + check_flight_status(client.Close()) + + def __del__(self): + # Not ideal, but close() wasn't originally present so + # applications may not be calling it + self.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + +cdef class FlightDataStream(_Weakrefable): + """ + Abstract base class for Flight data streams. + + See Also + -------- + RecordBatchStream + GeneratorStream + """ + + cdef CFlightDataStream* to_stream(self) except *: + """Create the C++ data stream for the backing Python object. + + We don't expose the C++ object to Python, so we can manage its + lifetime from the Cython/C++ side. + """ + raise NotImplementedError + + +cdef class RecordBatchStream(FlightDataStream): + """A Flight data stream backed by RecordBatches. + + The remainder of this DoGet request will be handled in C++, + without having to acquire the GIL. + + """ + cdef: + object data_source + CIpcWriteOptions write_options + + def __init__(self, data_source, options=None): + """Create a RecordBatchStream from a data source. + + Parameters + ---------- + data_source : RecordBatchReader or Table + The data to stream to the client. + options : pyarrow.ipc.IpcWriteOptions, optional + Optional IPC options to control how to write the data. 
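+
+        A typical use (sketch) is to return one of these from a
+        ``FlightServerBase.do_get`` handler, for example
+        ``return RecordBatchStream(my_table)``, where ``my_table`` is a
+        placeholder Table held by the server.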
+ """ + if (not isinstance(data_source, RecordBatchReader) and + not isinstance(data_source, lib.Table)): + raise TypeError("Expected RecordBatchReader or Table, " + "but got: {}".format(type(data_source))) + self.data_source = data_source + self.write_options = _get_options(options).c_options + + cdef CFlightDataStream* to_stream(self) except *: + cdef: + shared_ptr[CRecordBatchReader] reader + if isinstance(self.data_source, RecordBatchReader): + reader = ( self.data_source).reader + elif isinstance(self.data_source, lib.Table): + table = (
self.data_source).table + reader.reset(new TableBatchReader(deref(table))) + else: + raise RuntimeError("Can't construct RecordBatchStream " + "from type {}".format(type(self.data_source))) + return new CRecordBatchStream(reader, self.write_options) + + +cdef class GeneratorStream(FlightDataStream): + """A Flight data stream backed by a Python generator.""" + cdef: + shared_ptr[CSchema] schema + object generator + # A substream currently being consumed by the client, if + # present. Produced by the generator. + unique_ptr[CFlightDataStream] current_stream + CIpcWriteOptions c_options + + def __init__(self, schema, generator, options=None): + """Create a GeneratorStream from a Python generator. + + Parameters + ---------- + schema : Schema + The schema for the data to be returned. + + generator : iterator or iterable + The generator should yield other FlightDataStream objects, + Tables, RecordBatches, or RecordBatchReaders. + + options : pyarrow.ipc.IpcWriteOptions, optional + """ + self.schema = pyarrow_unwrap_schema(schema) + self.generator = iter(generator) + self.c_options = _get_options(options).c_options + + cdef CFlightDataStream* to_stream(self) except *: + cdef: + function[cb_data_stream_next] callback = &_data_stream_next + return new CPyGeneratorFlightDataStream(self, self.schema, callback, + self.c_options) + + +cdef class ServerCallContext(_Weakrefable): + """Per-call state/context.""" + cdef: + const CServerCallContext* context + + def peer_identity(self): + """Get the identity of the authenticated peer. + + May be the empty string. + """ + return tobytes(self.context.peer_identity()) + + def peer(self): + """Get the address of the peer.""" + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + return frombytes(self.context.peer(), safe=True) + + def is_cancelled(self): + """Check if the current RPC call has been canceled by the client.""" + return self.context.is_cancelled() + + def add_header(self, key, value): + """Add a response header.""" + self.context.AddHeader(tobytes(key), tobytes(value)) + + def add_trailer(self, key, value): + """Add a response trailer.""" + self.context.AddTrailer(tobytes(key), tobytes(value)) + + def get_middleware(self, key): + """ + Get a middleware instance by key. + + Returns None if the middleware was not found. 
+ """ + cdef: + CServerMiddleware* c_middleware = \ + self.context.GetMiddleware(CPyServerMiddlewareName) + CPyServerMiddleware* middleware + vector[CTracingServerMiddlewareTraceKey] c_trace_context + if c_middleware == NULL: + c_middleware = self.context.GetMiddleware(tobytes(key)) + + if c_middleware == NULL: + return None + elif c_middleware.name() == CPyServerMiddlewareName: + middleware = c_middleware + py_middleware = <_ServerMiddlewareWrapper> middleware.py_object() + return py_middleware.middleware.get(key) + elif c_middleware.name() == CTracingServerMiddlewareName: + c_trace_context = ( c_middleware + ).GetTraceContext() + trace_context = {pair.key: pair.value for pair in c_trace_context} + return TracingServerMiddleware(trace_context) + return None + + @staticmethod + cdef ServerCallContext wrap(const CServerCallContext& context): + cdef ServerCallContext result = \ + ServerCallContext.__new__(ServerCallContext) + result.context = &context + return result + + +cdef class ServerAuthReader(_Weakrefable): + """A reader for messages from the client during an auth handshake.""" + cdef: + CServerAuthReader* reader + + def read(self): + cdef c_string token + if not self.reader: + raise ValueError("Cannot use ServerAuthReader outside " + "ServerAuthHandler.authenticate") + with nogil: + check_flight_status(self.reader.Read(&token)) + return token + + cdef void poison(self): + """Prevent further usage of this object. + + This object is constructed by taking a pointer to a reference, + so we want to make sure Python users do not access this after + the reference goes away. + """ + self.reader = NULL + + @staticmethod + cdef ServerAuthReader wrap(CServerAuthReader* reader): + cdef ServerAuthReader result = \ + ServerAuthReader.__new__(ServerAuthReader) + result.reader = reader + return result + + +cdef class ServerAuthSender(_Weakrefable): + """A writer for messages to the client during an auth handshake.""" + cdef: + CServerAuthSender* sender + + def write(self, message): + cdef c_string c_message = tobytes(message) + if not self.sender: + raise ValueError("Cannot use ServerAuthSender outside " + "ServerAuthHandler.authenticate") + with nogil: + check_flight_status(self.sender.Write(c_message)) + + cdef void poison(self): + """Prevent further usage of this object. + + This object is constructed by taking a pointer to a reference, + so we want to make sure Python users do not access this after + the reference goes away. + """ + self.sender = NULL + + @staticmethod + cdef ServerAuthSender wrap(CServerAuthSender* sender): + cdef ServerAuthSender result = \ + ServerAuthSender.__new__(ServerAuthSender) + result.sender = sender + return result + + +cdef class ClientAuthReader(_Weakrefable): + """A reader for messages from the server during an auth handshake.""" + cdef: + CClientAuthReader* reader + + def read(self): + cdef c_string token + if not self.reader: + raise ValueError("Cannot use ClientAuthReader outside " + "ClientAuthHandler.authenticate") + with nogil: + check_flight_status(self.reader.Read(&token)) + return token + + cdef void poison(self): + """Prevent further usage of this object. + + This object is constructed by taking a pointer to a reference, + so we want to make sure Python users do not access this after + the reference goes away. 
+ """ + self.reader = NULL + + @staticmethod + cdef ClientAuthReader wrap(CClientAuthReader* reader): + cdef ClientAuthReader result = \ + ClientAuthReader.__new__(ClientAuthReader) + result.reader = reader + return result + + +cdef class ClientAuthSender(_Weakrefable): + """A writer for messages to the server during an auth handshake.""" + cdef: + CClientAuthSender* sender + + def write(self, message): + cdef c_string c_message = tobytes(message) + if not self.sender: + raise ValueError("Cannot use ClientAuthSender outside " + "ClientAuthHandler.authenticate") + with nogil: + check_flight_status(self.sender.Write(c_message)) + + cdef void poison(self): + """Prevent further usage of this object. + + This object is constructed by taking a pointer to a reference, + so we want to make sure Python users do not access this after + the reference goes away. + """ + self.sender = NULL + + @staticmethod + cdef ClientAuthSender wrap(CClientAuthSender* sender): + cdef ClientAuthSender result = \ + ClientAuthSender.__new__(ClientAuthSender) + result.sender = sender + return result + + +cdef CStatus _data_stream_next(void* self, CFlightPayload* payload) except *: + """Callback for implementing FlightDataStream in Python.""" + cdef: + unique_ptr[CFlightDataStream] data_stream + + py_stream = self + if not isinstance(py_stream, GeneratorStream): + raise RuntimeError("self object in callback is not GeneratorStream") + stream = py_stream + + # The generator is allowed to yield a reader or table which we + # yield from; if that sub-generator is empty, we need to reset and + # try again. However, limit the number of attempts so that we + # don't just spin forever. + max_attempts = 128 + for _ in range(max_attempts): + if stream.current_stream != nullptr: + with nogil: + check_flight_status( + stream.current_stream.get().Next().Value(payload)) + # If the stream ended, see if there's another stream from the + # generator + if payload.ipc_message.metadata != nullptr: + return CStatus_OK() + stream.current_stream.reset(nullptr) + + try: + result = next(stream.generator) + except StopIteration: + payload.ipc_message.metadata.reset( nullptr) + return CStatus_OK() + except FlightError as flight_error: + return ( flight_error).to_status() + + if isinstance(result, (list, tuple)): + result, metadata = result + else: + result, metadata = result, None + + if isinstance(result, (Table, RecordBatchReader)): + if metadata: + raise ValueError("Can only return metadata alongside a " + "RecordBatch.") + result = RecordBatchStream(result) + + stream_schema = pyarrow_wrap_schema(stream.schema) + if isinstance(result, FlightDataStream): + if metadata: + raise ValueError("Can only return metadata alongside a " + "RecordBatch.") + data_stream = unique_ptr[CFlightDataStream]( + ( result).to_stream()) + substream_schema = pyarrow_wrap_schema(data_stream.get().schema()) + if substream_schema != stream_schema: + raise ValueError("Got a FlightDataStream whose schema " + "does not match the declared schema of this " + "GeneratorStream. " + "Got: {}\nExpected: {}".format( + substream_schema, stream_schema)) + stream.current_stream.reset( + new CPyFlightDataStream(result, move(data_stream))) + # Loop around and try again + continue + elif isinstance(result, RecordBatch): + batch = result + if batch.schema != stream_schema: + raise ValueError("Got a RecordBatch whose schema does not " + "match the declared schema of this " + "GeneratorStream. 
" + "Got: {}\nExpected: {}".format(batch.schema, + stream_schema)) + check_flight_status(GetRecordBatchPayload( + deref(batch.batch), + stream.c_options, + &payload.ipc_message)) + if metadata: + payload.app_metadata = pyarrow_unwrap_buffer( + as_buffer(metadata)) + else: + raise TypeError("GeneratorStream must be initialized with " + "an iterator of FlightDataStream, Table, " + "RecordBatch, or RecordBatchStreamReader objects, " + "not {}.".format(type(result))) + # Don't loop around + return CStatus_OK() + # Ran out of attempts (the RPC handler kept yielding empty tables/readers) + raise RuntimeError("While getting next payload, ran out of attempts to " + "get something to send " + "(application server implementation error)") + + +cdef CStatus _list_flights(void* self, const CServerCallContext& context, + const CCriteria* c_criteria, + unique_ptr[CFlightListing]* listing) except *: + """Callback for implementing ListFlights in Python.""" + cdef: + vector[CFlightInfo] flights + + try: + result = ( self).list_flights(ServerCallContext.wrap(context), + c_criteria.expression) + for info in result: + if not isinstance(info, FlightInfo): + raise TypeError("FlightServerBase.list_flights must return " + "FlightInfo instances, but got {}".format( + type(info))) + flights.push_back(deref(( info).info.get())) + listing.reset(new CSimpleFlightListing(flights)) + except FlightError as flight_error: + return ( flight_error).to_status() + return CStatus_OK() + + +cdef CStatus _get_flight_info(void* self, const CServerCallContext& context, + CFlightDescriptor c_descriptor, + unique_ptr[CFlightInfo]* info) except *: + """Callback for implementing Flight servers in Python.""" + cdef: + FlightDescriptor py_descriptor = \ + FlightDescriptor.__new__(FlightDescriptor) + py_descriptor.descriptor = c_descriptor + try: + result = ( self).get_flight_info( + ServerCallContext.wrap(context), + py_descriptor) + except FlightError as flight_error: + return ( flight_error).to_status() + if not isinstance(result, FlightInfo): + raise TypeError("FlightServerBase.get_flight_info must return " + "a FlightInfo instance, but got {}".format( + type(result))) + info.reset(new CFlightInfo(deref(( result).info.get()))) + return CStatus_OK() + +cdef CStatus _get_schema(void* self, const CServerCallContext& context, + CFlightDescriptor c_descriptor, + unique_ptr[CSchemaResult]* info) except *: + """Callback for implementing Flight servers in Python.""" + cdef: + FlightDescriptor py_descriptor = \ + FlightDescriptor.__new__(FlightDescriptor) + py_descriptor.descriptor = c_descriptor + result = ( self).get_schema(ServerCallContext.wrap(context), + py_descriptor) + if not isinstance(result, SchemaResult): + raise TypeError("FlightServerBase.get_schema_info must return " + "a SchemaResult instance, but got {}".format( + type(result))) + info.reset(new CSchemaResult(deref(( result).result.get()))) + return CStatus_OK() + +cdef CStatus _do_put(void* self, const CServerCallContext& context, + unique_ptr[CFlightMessageReader] reader, + unique_ptr[CFlightMetadataWriter] writer) except *: + """Callback for implementing Flight servers in Python.""" + cdef: + MetadataRecordBatchReader py_reader = MetadataRecordBatchReader() + FlightMetadataWriter py_writer = FlightMetadataWriter() + FlightDescriptor descriptor = \ + FlightDescriptor.__new__(FlightDescriptor) + + descriptor.descriptor = reader.get().descriptor() + py_reader.reader.reset(reader.release()) + py_writer.writer.reset(writer.release()) + try: + ( 
self).do_put(ServerCallContext.wrap(context), descriptor, + py_reader, py_writer) + return CStatus_OK() + except FlightError as flight_error: + return ( flight_error).to_status() + + +cdef CStatus _do_get(void* self, const CServerCallContext& context, + CTicket ticket, + unique_ptr[CFlightDataStream]* stream) except *: + """Callback for implementing Flight servers in Python.""" + cdef: + unique_ptr[CFlightDataStream] data_stream + + py_ticket = Ticket(ticket.ticket) + try: + result = ( self).do_get(ServerCallContext.wrap(context), + py_ticket) + except FlightError as flight_error: + return ( flight_error).to_status() + if not isinstance(result, FlightDataStream): + raise TypeError("FlightServerBase.do_get must return " + "a FlightDataStream") + data_stream = unique_ptr[CFlightDataStream]( + ( result).to_stream()) + stream[0] = unique_ptr[CFlightDataStream]( + new CPyFlightDataStream(result, move(data_stream))) + return CStatus_OK() + + +cdef CStatus _do_exchange(void* self, const CServerCallContext& context, + unique_ptr[CFlightMessageReader] reader, + unique_ptr[CFlightMessageWriter] writer) except *: + """Callback for implementing Flight servers in Python.""" + cdef: + MetadataRecordBatchReader py_reader = MetadataRecordBatchReader() + MetadataRecordBatchWriter py_writer = MetadataRecordBatchWriter() + FlightDescriptor descriptor = \ + FlightDescriptor.__new__(FlightDescriptor) + + descriptor.descriptor = reader.get().descriptor() + py_reader.reader.reset(reader.release()) + py_writer.writer.reset(writer.release()) + try: + ( self).do_exchange(ServerCallContext.wrap(context), + descriptor, py_reader, py_writer) + return CStatus_OK() + except FlightError as flight_error: + return ( flight_error).to_status() + + +cdef CStatus _do_action_result_next( + void* self, + unique_ptr[CFlightResult]* result +) except *: + """Callback for implementing Flight servers in Python.""" + cdef: + CFlightResult* c_result + + try: + action_result = next( self) + if not isinstance(action_result, Result): + action_result = Result(action_result) + c_result = ( action_result).result.get() + result.reset(new CFlightResult(deref(c_result))) + except StopIteration: + result.reset(nullptr) + except FlightError as flight_error: + return ( flight_error).to_status() + return CStatus_OK() + + +cdef CStatus _do_action(void* self, const CServerCallContext& context, + const CAction& action, + unique_ptr[CResultStream]* result) except *: + """Callback for implementing Flight servers in Python.""" + cdef: + function[cb_result_next] ptr = &_do_action_result_next + py_action = Action(action.type, pyarrow_wrap_buffer(action.body)) + try: + responses = ( self).do_action(ServerCallContext.wrap(context), + py_action) + except FlightError as flight_error: + return ( flight_error).to_status() + # Let the application return an iterator or anything convertible + # into one + if responses is None: + # Server didn't return anything + responses = [] + result.reset(new CPyFlightResultStream(iter(responses), ptr)) + return CStatus_OK() + + +cdef CStatus _list_actions(void* self, const CServerCallContext& context, + vector[CActionType]* actions) except *: + """Callback for implementing Flight servers in Python.""" + cdef: + CActionType action_type + # Method should return a list of ActionTypes or similar tuple + try: + result = ( self).list_actions(ServerCallContext.wrap(context)) + for action in result: + if not isinstance(action, tuple): + raise TypeError( + "Results of list_actions must be ActionType or tuple") + action_type.type = 
tobytes(action[0]) + action_type.description = tobytes(action[1]) + actions.push_back(action_type) + except FlightError as flight_error: + return ( flight_error).to_status() + return CStatus_OK() + + +cdef CStatus _server_authenticate(void* self, CServerAuthSender* outgoing, + CServerAuthReader* incoming) except *: + """Callback for implementing authentication in Python.""" + sender = ServerAuthSender.wrap(outgoing) + reader = ServerAuthReader.wrap(incoming) + try: + ( self).authenticate(sender, reader) + except FlightError as flight_error: + return ( flight_error).to_status() + finally: + sender.poison() + reader.poison() + return CStatus_OK() + +cdef CStatus _is_valid(void* self, const c_string& token, + c_string* peer_identity) except *: + """Callback for implementing authentication in Python.""" + cdef c_string c_result + try: + c_result = tobytes(( self).is_valid(token)) + peer_identity[0] = c_result + except FlightError as flight_error: + return ( flight_error).to_status() + return CStatus_OK() + + +cdef CStatus _client_authenticate(void* self, CClientAuthSender* outgoing, + CClientAuthReader* incoming) except *: + """Callback for implementing authentication in Python.""" + sender = ClientAuthSender.wrap(outgoing) + reader = ClientAuthReader.wrap(incoming) + try: + ( self).authenticate(sender, reader) + except FlightError as flight_error: + return ( flight_error).to_status() + finally: + sender.poison() + reader.poison() + return CStatus_OK() + + +cdef CStatus _get_token(void* self, c_string* token) except *: + """Callback for implementing authentication in Python.""" + cdef c_string c_result + try: + c_result = tobytes(( self).get_token()) + token[0] = c_result + except FlightError as flight_error: + return ( flight_error).to_status() + return CStatus_OK() + + +cdef CStatus _middleware_sending_headers( + void* self, CAddCallHeaders* add_headers) except *: + """Callback for implementing middleware.""" + try: + headers = ( self).sending_headers() + except FlightError as flight_error: + return ( flight_error).to_status() + + if headers: + for header, values in headers.items(): + if isinstance(values, (str, bytes)): + values = (values,) + # Headers in gRPC (and HTTP/1, HTTP/2) are required to be + # valid, lowercase ASCII. + header = header.lower() + if isinstance(header, str): + header = header.encode("ascii") + for value in values: + if isinstance(value, str): + value = value.encode("ascii") + # Allow bytes values to pass through. 
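+                # Both the header name and value are bytes at this point;
+                # gRPC requires lowercase ASCII names, and binary values
+                # are only accepted for names ending in "-bin".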
+ add_headers.AddHeader(header, value) + + return CStatus_OK() + + +cdef CStatus _middleware_call_completed( + void* self, + const CStatus& call_status) except *: + """Callback for implementing middleware.""" + try: + try: + check_flight_status(call_status) + except Exception as e: + ( self).call_completed(e) + else: + ( self).call_completed(None) + except FlightError as flight_error: + return ( flight_error).to_status() + return CStatus_OK() + + +cdef CStatus _middleware_received_headers( + void* self, + const CCallHeaders& c_headers) except *: + """Callback for implementing middleware.""" + try: + headers = convert_headers(c_headers) + ( self).received_headers(headers) + except FlightError as flight_error: + return ( flight_error).to_status() + return CStatus_OK() + + +cdef dict convert_headers(const CCallHeaders& c_headers): + cdef: + CCallHeaders.const_iterator header_iter = c_headers.cbegin() + headers = {} + while header_iter != c_headers.cend(): + header = c_string(deref(header_iter).first).decode("ascii") + value = c_string(deref(header_iter).second) + if not header.endswith("-bin"): + # Text header values in gRPC (and HTTP/1, HTTP/2) are + # required to be valid ASCII. Binary header values are + # exposed as bytes. + value = value.decode("ascii") + headers.setdefault(header, []).append(value) + postincrement(header_iter) + return headers + + +cdef CStatus _server_middleware_start_call( + void* self, + const CCallInfo& c_info, + const CCallHeaders& c_headers, + shared_ptr[CServerMiddleware]* c_instance) except *: + """Callback for implementing server middleware.""" + instance = None + try: + call_info = wrap_call_info(c_info) + headers = convert_headers(c_headers) + instance = ( self).start_call(call_info, headers) + except FlightError as flight_error: + return ( flight_error).to_status() + + if instance: + ServerMiddleware.wrap(instance, c_instance) + + return CStatus_OK() + + +cdef CStatus _client_middleware_start_call( + void* self, + const CCallInfo& c_info, + unique_ptr[CClientMiddleware]* c_instance) except *: + """Callback for implementing client middleware.""" + instance = None + try: + call_info = wrap_call_info(c_info) + instance = ( self).start_call(call_info) + except FlightError as flight_error: + return ( flight_error).to_status() + + if instance: + ClientMiddleware.wrap(instance, c_instance) + + return CStatus_OK() + + +cdef class ServerAuthHandler(_Weakrefable): + """Authentication middleware for a server. + + To implement an authentication mechanism, subclass this class and + override its methods. + + """ + + def authenticate(self, outgoing, incoming): + """Conduct the handshake with the client. + + May raise an error if the client cannot authenticate. + + Parameters + ---------- + outgoing : ServerAuthSender + A channel to send messages to the client. + incoming : ServerAuthReader + A channel to read messages from the client. + """ + raise NotImplementedError + + def is_valid(self, token): + """Validate a client token, returning their identity. + + May return an empty string (if the auth mechanism does not + name the peer) or raise an exception (if the token is + invalid). + + Parameters + ---------- + token : bytes + The authentication token from the client. 
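+
+        Returns
+        -------
+        identity : str or bytes
+            The identity of the peer. It is converted to bytes with
+            ``tobytes`` before being handed back to the C++ layer.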
+ + """ + raise NotImplementedError + + cdef PyServerAuthHandler* to_handler(self): + cdef PyServerAuthHandlerVtable vtable + vtable.authenticate = _server_authenticate + vtable.is_valid = _is_valid + return new PyServerAuthHandler(self, vtable) + + +cdef class ClientAuthHandler(_Weakrefable): + """Authentication plugin for a client.""" + + def authenticate(self, outgoing, incoming): + """Conduct the handshake with the server. + + Parameters + ---------- + outgoing : ClientAuthSender + A channel to send messages to the server. + incoming : ClientAuthReader + A channel to read messages from the server. + """ + raise NotImplementedError + + def get_token(self): + """Get the auth token for a call.""" + raise NotImplementedError + + cdef PyClientAuthHandler* to_handler(self): + cdef PyClientAuthHandlerVtable vtable + vtable.authenticate = _client_authenticate + vtable.get_token = _get_token + return new PyClientAuthHandler(self, vtable) + + +_CallInfo = collections.namedtuple("_CallInfo", ["method"]) + + +class CallInfo(_CallInfo): + """Information about a particular RPC for Flight middleware.""" + + +cdef wrap_call_info(const CCallInfo& c_info): + method = wrap_flight_method(c_info.method) + return CallInfo(method=method) + + +cdef class ClientMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + client are accessible from the middleware itself. + + """ + + def start_call(self, info): + """Called at the start of an RPC. + + This must be thread-safe and must not raise exceptions. + + Parameters + ---------- + info : CallInfo + Information about the call. + + Returns + ------- + instance : ClientMiddleware + An instance of ClientMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + """ + + +cdef class ClientMiddleware(_Weakrefable): + """Client-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self): + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the request, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + + def received_headers(self, headers): + """A callback when headers are received. + + The default implementation does nothing. + + Parameters + ---------- + headers : dict + A dictionary of headers from the server. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + """ + + def call_completed(self, exception): + """A callback when the call finishes. + + The default implementation does nothing. + + Parameters + ---------- + exception : ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. 
+ + """ + + @staticmethod + cdef void wrap(object py_middleware, + unique_ptr[CClientMiddleware]* c_instance): + cdef PyClientMiddlewareVtable vtable + vtable.sending_headers = _middleware_sending_headers + vtable.received_headers = _middleware_received_headers + vtable.call_completed = _middleware_call_completed + c_instance[0].reset(new CPyClientMiddleware(py_middleware, vtable)) + + +cdef class ServerMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + middleware are accessible from the method itself. + + """ + + def start_call(self, info, headers): + """Called at the start of an RPC. + + This must be thread-safe. + + Parameters + ---------- + info : CallInfo + Information about the call. + headers : dict + A dictionary of headers from the client. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + Returns + ------- + instance : ServerMiddleware + An instance of ServerMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + Raises + ------ + exception : pyarrow.ArrowException + If an exception is raised, the call will be rejected with + the given error. + + """ + + +cdef class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + """A factory for tracing middleware instances. + + This enables OpenTelemetry support in Arrow (if Arrow was compiled + with OpenTelemetry support enabled). A new span will be started on + each RPC call. The TracingServerMiddleware instance can then be + retrieved within an RPC handler to get the propagated context, + which can be used to start a new span on the Python side. + + Because the Python/C++ OpenTelemetry libraries do not + interoperate, spans on the C++ side are not directly visible to + the Python side and vice versa. + + """ + + +cdef class ServerMiddleware(_Weakrefable): + """Server-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self): + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the response, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + + def call_completed(self, exception): + """A callback when the call finishes. + + Parameters + ---------- + exception : pyarrow.ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. 
+ + """ + + @staticmethod + cdef void wrap(object py_middleware, + shared_ptr[CServerMiddleware]* c_instance): + cdef PyServerMiddlewareVtable vtable + vtable.sending_headers = _middleware_sending_headers + vtable.call_completed = _middleware_call_completed + c_instance[0].reset(new CPyServerMiddleware(py_middleware, vtable)) + + +class TracingServerMiddleware(ServerMiddleware): + __slots__ = ["trace_context"] + + def __init__(self, trace_context): + self.trace_context = trace_context + + +cdef class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + """Wrapper to bundle server middleware into a single C++ one.""" + + cdef: + dict factories + + def __init__(self, dict factories): + self.factories = factories + + def start_call(self, info, headers): + instances = {} + for key, factory in self.factories.items(): + instance = factory.start_call(info, headers) + if instance: + # TODO: prevent duplicate keys + instances[key] = instance + if instances: + wrapper = _ServerMiddlewareWrapper(instances) + return wrapper + return None + + +cdef class _ServerMiddlewareWrapper(ServerMiddleware): + cdef: + dict middleware + + def __init__(self, dict middleware): + self.middleware = middleware + + def sending_headers(self): + headers = collections.defaultdict(list) + for instance in self.middleware.values(): + more_headers = instance.sending_headers() + if not more_headers: + continue + # Manually merge with existing headers (since headers are + # multi-valued) + for key, values in more_headers.items(): + # ARROW-16606 gRPC aborts given non-lowercase headers + key = key.lower() + if isinstance(values, (bytes, str)): + values = (values,) + headers[key].extend(values) + return headers + + def call_completed(self, exception): + for instance in self.middleware.values(): + instance.call_completed(exception) + + +cdef class _FlightServerFinalizer(_Weakrefable): + """ + A finalizer that shuts down the server on destruction. + + See ARROW-16597. If the server is still active at interpreter + exit, the process may segfault. + """ + + cdef: + shared_ptr[PyFlightServer] server + + def finalize(self): + cdef: + PyFlightServer* server = self.server.get() + CStatus status + if server == NULL: + return + try: + with nogil: + status = server.Shutdown() + if status.ok(): + status = server.Wait() + check_flight_status(status) + finally: + self.server.reset() + + +cdef class FlightServerBase(_Weakrefable): + """A Flight service definition. + + To start the server, create an instance of this class with an + appropriate location. The server will be running as soon as the + instance is created; it is not required to call :meth:`serve`. + + Override methods to define your Flight service. + + Parameters + ---------- + location : str, tuple or Location optional, default None + Location to serve on. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + If None is passed then the server will be started on localhost with a + system provided random port. + auth_handler : ServerAuthHandler optional, default None + An authentication mechanism to use. May be None. + tls_certificates : list optional, default None + A list of (certificate, key) pairs. + verify_client : boolean optional, default False + If True, then enable mutual TLS: require the client to present + a client certificate, and validate the certificate. 
+ root_certificates : bytes optional, default None + If enabling mutual TLS, this specifies the PEM-encoded root + certificate used to validate client certificates. + middleware : dict optional, default None + A dictionary of :class:`ServerMiddlewareFactory` instances. The + string keys can be used to retrieve the middleware instance within + RPC handlers (see :meth:`ServerCallContext.get_middleware`). + + """ + + cdef: + shared_ptr[PyFlightServer] server + object finalizer + + def __init__(self, location=None, auth_handler=None, + tls_certificates=None, verify_client=None, + root_certificates=None, middleware=None): + self.finalizer = None + if isinstance(location, (bytes, str)): + location = Location(location) + elif isinstance(location, (tuple, type(None))): + if location is None: + location = ('localhost', 0) + host, port = location + if tls_certificates: + location = Location.for_grpc_tls(host, port) + else: + location = Location.for_grpc_tcp(host, port) + elif not isinstance(location, Location): + raise TypeError('`location` argument must be a string, tuple or a ' + 'Location instance') + self.init(location, auth_handler, tls_certificates, verify_client, + tobytes(root_certificates or b""), middleware) + + cdef init(self, Location location, ServerAuthHandler auth_handler, + list tls_certificates, c_bool verify_client, + bytes root_certificates, dict middleware): + cdef: + PyFlightServerVtable vtable = PyFlightServerVtable() + PyFlightServer* c_server + unique_ptr[CFlightServerOptions] c_options + CCertKeyPair c_cert + function[cb_server_middleware_start_call] start_call = \ + &_server_middleware_start_call + pair[c_string, shared_ptr[CServerMiddlewareFactory]] c_middleware + + c_options.reset(new CFlightServerOptions(Location.unwrap(location))) + # mTLS configuration + c_options.get().verify_client = verify_client + c_options.get().root_certificates = root_certificates + + if auth_handler: + if not isinstance(auth_handler, ServerAuthHandler): + raise TypeError("auth_handler must be a ServerAuthHandler, " + "not a '{}'".format(type(auth_handler))) + c_options.get().auth_handler.reset( + ( auth_handler).to_handler()) + + if tls_certificates: + for cert, key in tls_certificates: + c_cert.pem_cert = tobytes(cert) + c_cert.pem_key = tobytes(key) + c_options.get().tls_certificates.push_back(c_cert) + + if middleware: + non_tracing_middleware = {} + enable_tracing = None + for key, factory in middleware.items(): + if isinstance(factory, TracingServerMiddlewareFactory): + if enable_tracing is not None: + raise ValueError( + "Can only provide " + "TracingServerMiddlewareFactory once") + if tobytes(key) == CPyServerMiddlewareName: + raise ValueError(f"Middleware key cannot be {key}") + enable_tracing = key + else: + non_tracing_middleware[key] = factory + + if enable_tracing: + c_middleware.first = tobytes(enable_tracing) + c_middleware.second = MakeTracingServerMiddlewareFactory() + c_options.get().middleware.push_back(c_middleware) + + py_middleware = _ServerMiddlewareFactoryWrapper( + non_tracing_middleware) + c_middleware.first = CPyServerMiddlewareName + c_middleware.second.reset(new CPyServerMiddlewareFactory( + py_middleware, + start_call)) + c_options.get().middleware.push_back(c_middleware) + + vtable.list_flights = &_list_flights + vtable.get_flight_info = &_get_flight_info + vtable.get_schema = &_get_schema + vtable.do_put = &_do_put + vtable.do_get = &_do_get + vtable.do_exchange = &_do_exchange + vtable.list_actions = &_list_actions + vtable.do_action = &_do_action + + c_server = 
new PyFlightServer(self, vtable) + self.server.reset(c_server) + with nogil: + check_flight_status(c_server.Init(deref(c_options))) + cdef _FlightServerFinalizer finalizer = _FlightServerFinalizer() + finalizer.server = self.server + self.finalizer = weakref.finalize(self, finalizer.finalize) + + @property + def port(self): + """ + Get the port that this server is listening on. + + Returns a non-positive value if the operation is invalid + (e.g. init() was not called or server is listening on a domain + socket). + """ + return self.server.get().port() + + def list_flights(self, context, criteria): + """List flights available on this service. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + criteria : bytes + Filter criteria provided by the client. + + Returns + ------- + iterator of FlightInfo + + """ + raise NotImplementedError + + def get_flight_info(self, context, descriptor): + """Get information about a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + FlightInfo + + """ + raise NotImplementedError + + def get_schema(self, context, descriptor): + """Get the schema of a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + Schema + + """ + raise NotImplementedError + + def do_put(self, context, descriptor, reader: MetadataRecordBatchReader, + writer: FlightMetadataWriter): + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : FlightMetadataWriter + A writer to send responses to the client. + + """ + raise NotImplementedError + + def do_get(self, context, ticket): + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + ticket : Ticket + The ticket for the flight. + + Returns + ------- + FlightDataStream + A stream of data to send back to the client. + + """ + raise NotImplementedError + + def do_exchange(self, context, descriptor, reader, writer): + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. 
+ writer : MetadataRecordBatchWriter + A writer to send responses to the client. + + """ + raise NotImplementedError + + def list_actions(self, context): + """List custom actions available on this server. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + + Returns + ------- + iterator of ActionType or tuple + + """ + raise NotImplementedError + + def do_action(self, context, action): + """Execute a custom action. + + This method should return an iterator, or it should be a + generator. Applications should override this method to + implement their own behavior. The default method raises a + NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + action : Action + The action to execute. + + Returns + ------- + iterator of bytes + + """ + raise NotImplementedError + + def serve(self): + """Block until the server shuts down. + + This method only returns if shutdown() is called or a signal is + received. + """ + if self.server.get() == nullptr: + raise ValueError("run() on uninitialized FlightServerBase") + with nogil: + check_flight_status(self.server.get().ServeWithSignals()) + + def run(self): + """Block until the server shuts down. + + .. deprecated:: 0.15.0 + Use the ``FlightServer.serve`` method instead + """ + warnings.warn("The 'FlightServer.run' method is deprecated, use " + "FlightServer.serve method instead") + self.serve() + + def shutdown(self): + """Shut down the server, blocking until current requests finish. + + Do not call this directly from the implementation of a Flight + method, as then the server will block forever waiting for that + request to finish. Instead, call this method from a background + thread. + + This method should only be called once. + """ + # Must not hold the GIL: shutdown waits for pending RPCs to + # complete. Holding the GIL means Python-implemented Flight + # methods will never get to run, so this will hang + # indefinitely. + if self.server.get() == nullptr: + raise ValueError("shutdown() on uninitialized FlightServerBase") + with nogil: + check_flight_status(self.server.get().Shutdown()) + + def wait(self): + """Block until server is terminated with shutdown.""" + with nogil: + self.server.get().Wait() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.finalizer: + self.finalizer() + + +def connect(location, **kwargs): + """ + Connect to a Flight server. + + Parameters + ---------- + location : str, tuple, or Location + Location to connect to. Either a URI like "grpc://localhost:port", + a tuple of (host, port), or a Location instance. + tls_root_certs : bytes or None + PEM-encoded. + cert_chain: str or None + If provided, enables TLS mutual authentication. + private_key: str or None + If provided, enables TLS mutual authentication. + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list or None + A list of ClientMiddlewareFactory instances to apply. + write_size_limit_bytes : int or None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. 
+ disable_server_verification : boolean or None + Disable verifying the server when using TLS. + Insecure, use with caution. + generic_options : list or None + A list of generic (string, int or string) options to pass to + the underlying transport. + + Returns + ------- + client : FlightClient + """ + return FlightClient(location, **kwargs) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..324e8014631543f59cc95457ade0079c8d52eb26 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.pxd new file mode 100644 index 0000000000000000000000000000000000000000..0df75530bbd6ec3552131e11acc5b0406627fe65 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.pxd @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
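The Flight surface documented above (the FlightServerBase handlers, serve()/shutdown(), and connect()) can be exercised end to end in a few lines. The sketch below is an editorial illustration, not part of the vendored file: the EchoServer class name, the table contents, and the use of port 0 are assumptions chosen for the example, and only do_get is overridden, so the remaining RPCs keep their NotImplementedError defaults.

    import pyarrow as pa
    import pyarrow.flight as flight

    class EchoServer(flight.FlightServerBase):
        # Hypothetical handler: serves the same small table for any ticket.
        def do_get(self, context, ticket):
            table = pa.table({"x": [1, 2, 3]})
            return flight.RecordBatchStream(table)

    # grpc://0.0.0.0:0 asks for any free port; server.port reports the bound one.
    with EchoServer(location="grpc://0.0.0.0:0") as server:
        client = flight.connect(f"grpc://localhost:{server.port}")
        reader = client.do_get(flight.Ticket(b""))
        print(reader.read_all())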
+ +# cython: language_level = 3 + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow_fs cimport * +from pyarrow.lib import _detect_compression, frombytes, tobytes +from pyarrow.lib cimport * + + +cpdef enum FileType: + NotFound = CFileType_NotFound + Unknown = CFileType_Unknown + File = CFileType_File + Directory = CFileType_Directory + + +cdef class FileInfo(_Weakrefable): + cdef: + CFileInfo info + + @staticmethod + cdef wrap(CFileInfo info) + + cdef inline CFileInfo unwrap(self) nogil + + @staticmethod + cdef CFileInfo unwrap_safe(obj) + + +cdef class FileSelector(_Weakrefable): + cdef: + CFileSelector selector + + @staticmethod + cdef FileSelector wrap(CFileSelector selector) + + cdef inline CFileSelector unwrap(self) nogil + + +cdef class FileSystem(_Weakrefable): + cdef: + shared_ptr[CFileSystem] wrapped + CFileSystem* fs + + cdef init(self, const shared_ptr[CFileSystem]& wrapped) + + @staticmethod + cdef wrap(const shared_ptr[CFileSystem]& sp) + + cdef inline shared_ptr[CFileSystem] unwrap(self) nogil + + +cdef class LocalFileSystem(FileSystem): + cdef init(self, const shared_ptr[CFileSystem]& wrapped) + + +cdef class SubTreeFileSystem(FileSystem): + cdef: + CSubTreeFileSystem* subtreefs + + cdef init(self, const shared_ptr[CFileSystem]& wrapped) + + +cdef class _MockFileSystem(FileSystem): + cdef: + CMockFileSystem* mockfs + + cdef init(self, const shared_ptr[CFileSystem]& wrapped) + + +cdef class PyFileSystem(FileSystem): + cdef: + CPyFileSystem* pyfs + + cdef init(self, const shared_ptr[CFileSystem]& wrapped) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.pyx new file mode 100644 index 0000000000000000000000000000000000000000..dbfb6ed114553bc8435d0f61090c554e4ede5632 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_fs.pyx @@ -0,0 +1,1628 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
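Before the implementation that follows, a brief usage sketch of the declarations above (FileInfo, FileSelector, FileSystem, LocalFileSystem) may help. It is an editorial illustration; the /tmp path is an arbitrary assumption.

    from pyarrow import fs

    local = fs.LocalFileSystem()
    with local.open_output_stream("/tmp/fs-sketch.dat") as out:
        out.write(b"data")

    # A single path yields one FileInfo; a FileSelector yields a list of them.
    info = local.get_file_info("/tmp/fs-sketch.dat")
    print(info.is_file, info.size)        # True 4

    selector = fs.FileSelector("/tmp", recursive=False)
    print(len(local.get_file_info(selector)))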
+ +# cython: language_level = 3 + +from cpython.datetime cimport datetime, PyDateTime_DateTime +from cython cimport binding + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint +from pyarrow.lib import _detect_compression, frombytes, tobytes +from pyarrow.lib cimport * +from pyarrow.util import _stringify_path + +from abc import ABC, abstractmethod +from datetime import datetime, timezone +import os +import pathlib +import sys + + +cdef _init_ca_paths(): + cdef CFileSystemGlobalOptions options + + import ssl + paths = ssl.get_default_verify_paths() + if paths.cafile: + options.tls_ca_file_path = os.fsencode(paths.cafile) + if paths.capath: + options.tls_ca_dir_path = os.fsencode(paths.capath) + check_status(CFileSystemsInitialize(options)) + + +if sys.platform == 'linux': + # ARROW-9261: On Linux, we may need to fixup the paths to TLS CA certs + # (especially in manylinux packages) since the values hardcoded at + # compile-time in libcurl may be wrong. + _init_ca_paths() + + +cdef inline c_string _path_as_bytes(path) except *: + # handle only abstract paths, not bound to any filesystem like pathlib is, + # so we only accept plain strings + if not isinstance(path, (bytes, str)): + raise TypeError('Path must be a string') + # tobytes always uses utf-8, which is more or less ok, at least on Windows + # since the C++ side then decodes from utf-8. On Unix, os.fsencode may be + # better. + return tobytes(path) + + +cdef object _wrap_file_type(CFileType ty): + return FileType( ty) + + +cdef CFileType _unwrap_file_type(FileType ty) except *: + if ty == FileType.Unknown: + return CFileType_Unknown + elif ty == FileType.NotFound: + return CFileType_NotFound + elif ty == FileType.File: + return CFileType_File + elif ty == FileType.Directory: + return CFileType_Directory + assert 0 + + +def _file_type_to_string(ty): + # Python 3.11 changed str(IntEnum) to return the string representation + # of the integer value: https://github.com/python/cpython/issues/94763 + return f"{ty.__class__.__name__}.{ty._name_}" + + +cdef class FileInfo(_Weakrefable): + """ + FileSystem entry info. + + Parameters + ---------- + path : str + The full path to the filesystem entry. + type : FileType + The type of the filesystem entry. + mtime : datetime or float, default None + If given, the modification time of the filesystem entry. + If a float is given, it is the number of seconds since the + Unix epoch. + mtime_ns : int, default None + If given, the modification time of the filesystem entry, + in nanoseconds since the Unix epoch. + `mtime` and `mtime_ns` are mutually exclusive. + size : int, default None + If given, the filesystem entry size in bytes. This should only + be given if `type` is `FileType.File`. + + Examples + -------- + Generate a file: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> path_fs = local_path + '/pyarrow-fs-example.dat' + >>> with local.open_output_stream(path_fs) as stream: + ... 
stream.write(b'data') + 4 + + Get FileInfo object using ``get_file_info()``: + + >>> file_info = local.get_file_info(path_fs) + >>> file_info + + + Inspect FileInfo attributes: + + >>> file_info.type + + + >>> file_info.is_file + True + + >>> file_info.path + '/.../pyarrow-fs-example.dat' + + >>> file_info.base_name + 'pyarrow-fs-example.dat' + + >>> file_info.size + 4 + + >>> file_info.extension + 'dat' + + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + + def __init__(self, path, FileType type=FileType.Unknown, *, + mtime=None, mtime_ns=None, size=None): + self.info.set_path(tobytes(path)) + self.info.set_type(_unwrap_file_type(type)) + if mtime is not None: + if mtime_ns is not None: + raise TypeError("Only one of mtime and mtime_ns " + "can be given") + if isinstance(mtime, datetime): + self.info.set_mtime(PyDateTime_to_TimePoint( + mtime)) + else: + self.info.set_mtime(TimePoint_from_s(mtime)) + elif mtime_ns is not None: + self.info.set_mtime(TimePoint_from_ns(mtime_ns)) + if size is not None: + self.info.set_size(size) + + @staticmethod + cdef wrap(CFileInfo info): + cdef FileInfo self = FileInfo.__new__(FileInfo) + self.info = move(info) + return self + + cdef inline CFileInfo unwrap(self) nogil: + return self.info + + @staticmethod + cdef CFileInfo unwrap_safe(obj): + if not isinstance(obj, FileInfo): + raise TypeError("Expected FileInfo instance, got {0}" + .format(type(obj))) + return ( obj).unwrap() + + def __repr__(self): + def getvalue(attr): + try: + return getattr(self, attr) + except ValueError: + return '' + + s = (f'>> file_info = local.get_file_info(path) + >>> file_info.path + '/.../pyarrow-fs-example.dat' + """ + return frombytes(self.info.path()) + + @property + def base_name(self): + """ + The file base name. + + Component after the last directory separator. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.base_name + 'pyarrow-fs-example.dat' + """ + return frombytes(self.info.base_name()) + + @property + def size(self): + """ + The size in bytes, if available. + + Only regular files are guaranteed to have a size. + + Returns + ------- + size : int or None + """ + cdef int64_t size + size = self.info.size() + return (size if size != -1 else None) + + @property + def extension(self): + """ + The file extension. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.extension + 'dat' + """ + return frombytes(self.info.extension()) + + @property + def mtime(self): + """ + The time of last modification, if available. + + Returns + ------- + mtime : datetime.datetime or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + """ + cdef int64_t nanoseconds + nanoseconds = TimePoint_to_ns(self.info.mtime()) + return (datetime.fromtimestamp(nanoseconds / 1.0e9, timezone.utc) + if nanoseconds != -1 else None) + + @property + def mtime_ns(self): + """ + The time of last modification, if available, expressed in nanoseconds + since the Unix epoch. 
+ + Returns + ------- + mtime_ns : int or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + cdef int64_t nanoseconds + nanoseconds = TimePoint_to_ns(self.info.mtime()) + return (nanoseconds if nanoseconds != -1 else None) + + +cdef class FileSelector(_Weakrefable): + """ + File and directory selector. + + It contains a set of options that describes how to search for files and + directories. + + Parameters + ---------- + base_dir : str + The directory in which to select files. Relative paths also work, use + '.' for the current directory and '..' for the parent. + allow_not_found : bool, default False + The behavior if `base_dir` doesn't exist in the filesystem. + If false, an error is returned. + If true, an empty selection is returned. + recursive : bool, default False + Whether to recurse into subdirectories. + + Examples + -------- + List the contents of a directory and subdirectories: + + >>> selector_1 = fs.FileSelector(local_path, recursive=True) + >>> local.get_file_info(selector_1) # doctest: +SKIP + [, + , + ] + + List only the contents of the base directory: + + >>> selector_2 = fs.FileSelector(local_path) + >>> local.get_file_info(selector_2) # doctest: +SKIP + [, + ] + + Return empty selection if the directory doesn't exist: + + >>> selector_not_found = fs.FileSelector(local_path + '/missing', + ... recursive=True, + ... allow_not_found=True) + >>> local.get_file_info(selector_not_found) + [] + """ + + def __init__(self, base_dir, bint allow_not_found=False, + bint recursive=False): + self.base_dir = base_dir + self.recursive = recursive + self.allow_not_found = allow_not_found + + @staticmethod + cdef FileSelector wrap(CFileSelector wrapped): + cdef FileSelector self = FileSelector.__new__(FileSelector) + self.selector = move(wrapped) + return self + + cdef inline CFileSelector unwrap(self) nogil: + return self.selector + + @property + def base_dir(self): + return frombytes(self.selector.base_dir) + + @base_dir.setter + def base_dir(self, base_dir): + self.selector.base_dir = _path_as_bytes(base_dir) + + @property + def allow_not_found(self): + return self.selector.allow_not_found + + @allow_not_found.setter + def allow_not_found(self, bint allow_not_found): + self.selector.allow_not_found = allow_not_found + + @property + def recursive(self): + return self.selector.recursive + + @recursive.setter + def recursive(self, bint recursive): + self.selector.recursive = recursive + + def __repr__(self): + return ("".format(self)) + + +cdef class FileSystem(_Weakrefable): + """ + Abstract file system API. + """ + + def __init__(self): + raise TypeError("FileSystem is an abstract class, instantiate one of " + "the subclasses instead: LocalFileSystem or " + "SubTreeFileSystem") + + @staticmethod + @binding(True) # Required for cython < 3 + def _from_uri(uri): + fs, _path = FileSystem.from_uri(uri) + return fs + + @staticmethod + def from_uri(uri): + """ + Create a new FileSystem from URI or Path. + + Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". + In addition, the argument can be a pathlib.Path object, or a string + describing an absolute local path. + + Parameters + ---------- + uri : string + URI-based path, for example: file:///some/local/path. + + Returns + ------- + tuple of (FileSystem, str path) + With (filesystem, path) tuple where path is the abstract path + inside the FileSystem instance. 
+ + Examples + -------- + Create a new FileSystem subclass from a URI: + + >>> uri = 'file:///{}/pyarrow-fs-example.dat'.format(local_path) + >>> local_new, path_new = fs.FileSystem.from_uri(uri) + >>> local_new + >> path_new + '/.../pyarrow-fs-example.dat' + + Or from a s3 bucket: + + >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") + (, 'usgs-landsat/collection02') + """ + cdef: + c_string c_path + c_string c_uri + CResult[shared_ptr[CFileSystem]] result + + if isinstance(uri, pathlib.Path): + # Make absolute + uri = uri.resolve().absolute() + c_uri = tobytes(_stringify_path(uri)) + with nogil: + result = CFileSystemFromUriOrPath(c_uri, &c_path) + return FileSystem.wrap(GetResultValue(result)), frombytes(c_path) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + self.wrapped = wrapped + self.fs = wrapped.get() + + @staticmethod + cdef wrap(const shared_ptr[CFileSystem]& sp): + cdef FileSystem self + + typ = frombytes(sp.get().type_name()) + if typ == 'local': + self = LocalFileSystem.__new__(LocalFileSystem) + elif typ == 'mock': + self = _MockFileSystem.__new__(_MockFileSystem) + elif typ == 'subtree': + self = SubTreeFileSystem.__new__(SubTreeFileSystem) + elif typ == 's3': + from pyarrow._s3fs import S3FileSystem + self = S3FileSystem.__new__(S3FileSystem) + elif typ == 'gcs': + from pyarrow._gcsfs import GcsFileSystem + self = GcsFileSystem.__new__(GcsFileSystem) + elif typ == 'abfs': + from pyarrow._azurefs import AzureFileSystem + self = AzureFileSystem.__new__(AzureFileSystem) + elif typ == 'hdfs': + from pyarrow._hdfs import HadoopFileSystem + self = HadoopFileSystem.__new__(HadoopFileSystem) + elif typ.startswith('py::'): + self = PyFileSystem.__new__(PyFileSystem) + else: + raise TypeError('Cannot wrap FileSystem pointer') + + self.init(sp) + return self + + cdef inline shared_ptr[CFileSystem] unwrap(self) nogil: + return self.wrapped + + def equals(self, FileSystem other not None): + """ + Parameters + ---------- + other : pyarrow.fs.FileSystem + + Returns + ------- + bool + """ + return self.fs.Equals(other.unwrap()) + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + + @property + def type_name(self): + """ + The filesystem's type name. + """ + return frombytes(self.fs.type_name()) + + def get_file_info(self, paths_or_selector): + """ + Get info for the given files. + + Any symlink is automatically dereferenced, recursively. A non-existing + or unreachable file returns a FileStat object and has a FileType of + value NotFound. An exception indicates a truly exceptional condition + (low-level I/O error, etc.). + + Parameters + ---------- + paths_or_selector : FileSelector, path-like or list of path-likes + Either a selector object, a path-like object or a list of + path-like objects. The selector's base directory will not be + part of the results, even if it exists. If it doesn't exist, + use `allow_not_found`. + + Returns + ------- + FileInfo or list of FileInfo + Single FileInfo object is returned for a single path, otherwise + a list of FileInfo objects is returned. 
+ + Examples + -------- + >>> local + + >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) + + """ + cdef: + CFileInfo info + c_string path + vector[CFileInfo] infos + vector[c_string] paths + CFileSelector selector + + if isinstance(paths_or_selector, FileSelector): + with nogil: + selector = (paths_or_selector).selector + infos = GetResultValue(self.fs.GetFileInfo(selector)) + elif isinstance(paths_or_selector, (list, tuple)): + paths = [_path_as_bytes(s) for s in paths_or_selector] + with nogil: + infos = GetResultValue(self.fs.GetFileInfo(paths)) + elif isinstance(paths_or_selector, (bytes, str)): + path =_path_as_bytes(paths_or_selector) + with nogil: + info = GetResultValue(self.fs.GetFileInfo(path)) + return FileInfo.wrap(info) + else: + raise TypeError('Must pass either path(s) or a FileSelector') + + return [FileInfo.wrap(info) for info in infos] + + def create_dir(self, path, *, bint recursive=True): + """ + Create a directory and subdirectories. + + This function succeeds if the directory already exists. + + Parameters + ---------- + path : str + The path of the new directory. + recursive : bool, default True + Create nested directories as well. + """ + cdef c_string directory = _path_as_bytes(path) + with nogil: + check_status(self.fs.CreateDir(directory, recursive=recursive)) + + def delete_dir(self, path): + """ + Delete a directory and its contents, recursively. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + """ + cdef c_string directory = _path_as_bytes(path) + with nogil: + check_status(self.fs.DeleteDir(directory)) + + def delete_dir_contents(self, path, *, + bint accept_root_dir=False, + bint missing_dir_ok=False): + """ + Delete a directory's contents, recursively. + + Like delete_dir, but doesn't delete the directory itself. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + accept_root_dir : boolean, default False + Allow deleting the root directory's contents + (if path is empty or "/") + missing_dir_ok : boolean, default False + If False then an error is raised if path does + not exist + """ + cdef c_string directory = _path_as_bytes(path) + if accept_root_dir and directory.strip(b"/") == b"": + with nogil: + check_status(self.fs.DeleteRootDirContents()) + else: + with nogil: + check_status(self.fs.DeleteDirContents(directory, + missing_dir_ok)) + + def move(self, src, dest): + """ + Move / rename a file or directory. + + If the destination exists: + - if it is a non-empty directory, an error is returned + - otherwise, if it has the same type as the source, it is replaced + - otherwise, behavior is unspecified (implementation-dependent). + + Parameters + ---------- + src : str + The path of the file or the directory to be moved. + dest : str + The destination path where the file or directory is moved to. + + Examples + -------- + Create a new folder with a file: + + >>> local.create_dir('/tmp/other_dir') + >>> local.copy_file(path,'/tmp/move_example.dat') + + Move the file: + + >>> local.move('/tmp/move_example.dat', + ... 
'/tmp/other_dir/move_example_2.dat') + + Inspect the file info: + + >>> local.get_file_info('/tmp/other_dir/move_example_2.dat') + + >>> local.get_file_info('/tmp/move_example.dat') + + + Delete the folder: + >>> local.delete_dir('/tmp/other_dir') + """ + cdef: + c_string source = _path_as_bytes(src) + c_string destination = _path_as_bytes(dest) + with nogil: + check_status(self.fs.Move(source, destination)) + + def copy_file(self, src, dest): + """ + Copy a file. + + If the destination exists and is a directory, an error is returned. + Otherwise, it is replaced. + + Parameters + ---------- + src : str + The path of the file to be copied from. + dest : str + The destination path where the file is copied to. + + Examples + -------- + >>> local.copy_file(path, + ... local_path + '/pyarrow-fs-example_copy.dat') + + Inspect the file info: + + >>> local.get_file_info(local_path + '/pyarrow-fs-example_copy.dat') + + >>> local.get_file_info(path) + + """ + cdef: + c_string source = _path_as_bytes(src) + c_string destination = _path_as_bytes(dest) + with nogil: + check_status(self.fs.CopyFile(source, destination)) + + def delete_file(self, path): + """ + Delete a file. + + Parameters + ---------- + path : str + The path of the file to be deleted. + """ + cdef c_string file = _path_as_bytes(path) + with nogil: + check_status(self.fs.DeleteFile(file)) + + def _wrap_input_stream(self, stream, path, compression, buffer_size): + if buffer_size is not None and buffer_size != 0: + stream = BufferedInputStream(stream, buffer_size) + if compression == 'detect': + compression = _detect_compression(path) + if compression is not None: + stream = CompressedInputStream(stream, compression) + return stream + + def _wrap_output_stream(self, stream, path, compression, buffer_size): + if buffer_size is not None and buffer_size != 0: + stream = BufferedOutputStream(stream, buffer_size) + if compression == 'detect': + compression = _detect_compression(path) + if compression is not None: + stream = CompressedOutputStream(stream, compression) + return stream + + def open_input_file(self, path): + """ + Open an input file for random access reading. + + Parameters + ---------- + path : str + The source to open for reading. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_file()`: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data' + """ + cdef: + c_string pathstr = _path_as_bytes(path) + NativeFile stream = NativeFile() + shared_ptr[CRandomAccessFile] in_handle + + with nogil: + in_handle = GetResultValue(self.fs.OpenInputFile(pathstr)) + + stream.set_random_access_file(in_handle) + stream.is_readable = True + return stream + + def open_input_stream(self, path, compression='detect', buffer_size=None): + """ + Open an input stream for sequential reading. + + Parameters + ---------- + path : str + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
+ + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_stream()`: + + >>> with local.open_input_stream(path) as f: + ... print(f.readall()) + b'data' + """ + cdef: + c_string pathstr = _path_as_bytes(path) + NativeFile stream = NativeFile() + shared_ptr[CInputStream] in_handle + + with nogil: + in_handle = GetResultValue(self.fs.OpenInputStream(pathstr)) + + stream.set_input_stream(in_handle) + stream.is_readable = True + + return self._wrap_input_stream( + stream, path=path, compression=compression, buffer_size=buffer_size + ) + + def open_output_stream(self, path, compression='detect', + buffer_size=None, metadata=None): + """ + Open an output stream for sequential writing. + + If the target already exists, existing data is truncated. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream(path) as stream: + ... stream.write(b'data') + 4 + """ + cdef: + c_string pathstr = _path_as_bytes(path) + NativeFile stream = NativeFile() + shared_ptr[COutputStream] out_handle + shared_ptr[const CKeyValueMetadata] c_metadata + + if metadata is not None: + c_metadata = pyarrow_unwrap_metadata(KeyValueMetadata(metadata)) + + with nogil: + out_handle = GetResultValue( + self.fs.OpenOutputStream(pathstr, c_metadata)) + + stream.set_output_stream(out_handle) + stream.is_writable = True + + return self._wrap_output_stream( + stream, path=path, compression=compression, buffer_size=buffer_size + ) + + def open_append_stream(self, path, compression='detect', + buffer_size=None, metadata=None): + """ + Open an output stream for appending. + + If the target doesn't exist, a new empty file is created. + + .. note:: + Some filesystem implementations do not support efficient + appending to an existing file, in which case this method will + raise NotImplementedError. + Consider writing to multiple files (using e.g. the dataset layer) + instead. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. 
+ + Returns + ------- + stream : NativeFile + + Examples + -------- + Append new data to a FileSystem subclass with nonempty file: + + >>> with local.open_append_stream(path) as f: + ... f.write(b'+newly added') + 12 + + Print out the content fo the file: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data+newly added' + """ + cdef: + c_string pathstr = _path_as_bytes(path) + NativeFile stream = NativeFile() + shared_ptr[COutputStream] out_handle + shared_ptr[const CKeyValueMetadata] c_metadata + + if metadata is not None: + c_metadata = pyarrow_unwrap_metadata(KeyValueMetadata(metadata)) + + with nogil: + out_handle = GetResultValue( + self.fs.OpenAppendStream(pathstr, c_metadata)) + + stream.set_output_stream(out_handle) + stream.is_writable = True + + return self._wrap_output_stream( + stream, path=path, compression=compression, buffer_size=buffer_size + ) + + def normalize_path(self, path): + """ + Normalize filesystem path. + + Parameters + ---------- + path : str + The path to normalize + + Returns + ------- + normalized_path : str + The normalized path + """ + cdef: + c_string c_path = _path_as_bytes(path) + c_string c_path_normalized + + c_path_normalized = GetResultValue(self.fs.NormalizePath(c_path)) + return frombytes(c_path_normalized) + + +cdef class LocalFileSystem(FileSystem): + """ + A FileSystem implementation accessing files on the local machine. + + Details such as symlinks are abstracted away (symlinks are always followed, + except when deleting an entry). + + Parameters + ---------- + use_mmap : bool, default False + Whether open_input_stream and open_input_file should return + a mmap'ed file or a regular file. + + Examples + -------- + Create a FileSystem object with LocalFileSystem constructor: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> local + + + and write data on to the file: + + >>> with local.open_output_stream('/tmp/local_fs.dat') as stream: + ... stream.write(b'data') + 4 + >>> with local.open_input_stream('/tmp/local_fs.dat') as stream: + ... print(stream.readall()) + b'data' + + Create a FileSystem object inferred from a URI of the saved file: + + >>> local_new, path = fs.LocalFileSystem().from_uri('/tmp/local_fs.dat') + >>> local_new + >> path + '/tmp/local_fs.dat' + + Check if FileSystems `local` and `local_new` are equal: + + >>> local.equals(local_new) + True + + Compare two different FileSystems: + + >>> local2 = fs.LocalFileSystem(use_mmap=True) + >>> local.equals(local2) + False + + Copy a file and print out the data: + + >>> local.copy_file('/tmp/local_fs.dat', '/tmp/local_fs-copy.dat') + >>> with local.open_input_stream('/tmp/local_fs-copy.dat') as stream: + ... print(stream.readall()) + ... + b'data' + + Open an output stream for appending, add text and print the new data: + + >>> with local.open_append_stream('/tmp/local_fs-copy.dat') as f: + ... f.write(b'+newly added') + 12 + + >>> with local.open_input_stream('/tmp/local_fs-copy.dat') as f: + ... 
print(f.readall()) + b'data+newly added' + + Create a directory, copy a file into it and then delete the whole directory: + + >>> local.create_dir('/tmp/new_folder') + >>> local.copy_file('/tmp/local_fs.dat', '/tmp/new_folder/local_fs.dat') + >>> local.get_file_info('/tmp/new_folder') + + >>> local.delete_dir('/tmp/new_folder') + >>> local.get_file_info('/tmp/new_folder') + + + Create a directory, copy a file into it and then delete + the content of the directory: + + >>> local.create_dir('/tmp/new_folder') + >>> local.copy_file('/tmp/local_fs.dat', '/tmp/new_folder/local_fs.dat') + >>> local.get_file_info('/tmp/new_folder/local_fs.dat') + + >>> local.delete_dir_contents('/tmp/new_folder') + >>> local.get_file_info('/tmp/new_folder') + + >>> local.get_file_info('/tmp/new_folder/local_fs.dat') + + + Create a directory, copy a file into it and then delete + the file from the directory: + + >>> local.create_dir('/tmp/new_folder') + >>> local.copy_file('/tmp/local_fs.dat', '/tmp/new_folder/local_fs.dat') + >>> local.delete_file('/tmp/new_folder/local_fs.dat') + >>> local.get_file_info('/tmp/new_folder/local_fs.dat') + + >>> local.get_file_info('/tmp/new_folder') + + + Move the file: + + >>> local.move('/tmp/local_fs-copy.dat', '/tmp/new_folder/local_fs-copy.dat') + >>> local.get_file_info('/tmp/new_folder/local_fs-copy.dat') + + >>> local.get_file_info('/tmp/local_fs-copy.dat') + + + To finish delete the file left: + >>> local.delete_file('/tmp/local_fs.dat') + """ + + def __init__(self, *, use_mmap=False): + cdef: + shared_ptr[CFileSystem] fs + c_string c_uri + + # from_uri needs a non-empty path, so just use a placeholder of /_ + c_uri = tobytes(f"file:///_?use_mmap={int(use_mmap)}") + with nogil: + fs = GetResultValue(CFileSystemFromUri(c_uri)) + self.init( fs) + + def __reduce__(self): + uri = frombytes(GetResultValue(self.fs.MakeUri(b"/_"))) + return FileSystem._from_uri, (uri,) + + +cdef class SubTreeFileSystem(FileSystem): + """ + Delegates to another implementation after prepending a fixed base path. + + This is useful to expose a logical view of a subtree of a filesystem, + for example a directory in a LocalFileSystem. + + Note, that this makes no security guarantee. For example, symlinks may + allow to "escape" the subtree and access other parts of the underlying + filesystem. + + Parameters + ---------- + base_path : str + The root of the subtree. + base_fs : FileSystem + FileSystem object the operations delegated to. + + Examples + -------- + Create a LocalFileSystem instance: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream('/tmp/local_fs.dat') as stream: + ... stream.write(b'data') + 4 + + Create a directory and a SubTreeFileSystem instance: + + >>> local.create_dir('/tmp/sub_tree') + >>> subtree = fs.SubTreeFileSystem('/tmp/sub_tree', local) + + Write data into the existing file: + + >>> with subtree.open_append_stream('sub_tree_fs.dat') as f: + ... f.write(b'+newly added') + 12 + + Print out the attributes: + + >>> subtree.base_fs + + >>> subtree.base_path + '/tmp/sub_tree/' + + Get info for the given directory or given file: + + >>> subtree.get_file_info('') + + >>> subtree.get_file_info('sub_tree_fs.dat') + + + Delete the file and directory: + + >>> subtree.delete_file('sub_tree_fs.dat') + >>> local.delete_dir('/tmp/sub_tree') + >>> local.delete_file('/tmp/local_fs.dat') + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. 
+ """ + + def __init__(self, base_path, FileSystem base_fs): + cdef: + c_string pathstr + shared_ptr[CSubTreeFileSystem] wrapped + + pathstr = _path_as_bytes(base_path) + wrapped = make_shared[CSubTreeFileSystem](pathstr, base_fs.wrapped) + + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.subtreefs = wrapped.get() + + def __repr__(self): + return ("SubTreeFileSystem(base_path={}, base_fs={}" + .format(self.base_path, self.base_fs)) + + def __reduce__(self): + return SubTreeFileSystem, ( + frombytes(self.subtreefs.base_path()), + FileSystem.wrap(self.subtreefs.base_fs()) + ) + + @property + def base_path(self): + return frombytes(self.subtreefs.base_path()) + + @property + def base_fs(self): + return FileSystem.wrap(self.subtreefs.base_fs()) + + +cdef class _MockFileSystem(FileSystem): + + def __init__(self, datetime current_time=None): + cdef shared_ptr[CMockFileSystem] wrapped + + current_time = current_time or datetime.now() + wrapped = make_shared[CMockFileSystem]( + PyDateTime_to_TimePoint( current_time) + ) + + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.mockfs = wrapped.get() + + +cdef class PyFileSystem(FileSystem): + """ + A FileSystem with behavior implemented in Python. + + Parameters + ---------- + handler : FileSystemHandler + The handler object implementing custom filesystem behavior. + + Examples + -------- + Create an fsspec-based filesystem object for GitHub: + + >>> from fsspec.implementations import github + >>> gfs = github.GithubFileSystem('apache', 'arrow') # doctest: +SKIP + + Get a PyArrow FileSystem object: + + >>> from pyarrow.fs import PyFileSystem, FSSpecHandler + >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP + + Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: + + >>> pa_fs.get_file_info('README.md') # doctest: +SKIP + + """ + + def __init__(self, handler): + cdef: + CPyFileSystemVtable vtable + shared_ptr[CPyFileSystem] wrapped + + if not isinstance(handler, FileSystemHandler): + raise TypeError("Expected a FileSystemHandler instance, got {0}" + .format(type(handler))) + + vtable.get_type_name = _cb_get_type_name + vtable.equals = _cb_equals + vtable.get_file_info = _cb_get_file_info + vtable.get_file_info_vector = _cb_get_file_info_vector + vtable.get_file_info_selector = _cb_get_file_info_selector + vtable.create_dir = _cb_create_dir + vtable.delete_dir = _cb_delete_dir + vtable.delete_dir_contents = _cb_delete_dir_contents + vtable.delete_root_dir_contents = _cb_delete_root_dir_contents + vtable.delete_file = _cb_delete_file + vtable.move = _cb_move + vtable.copy_file = _cb_copy_file + vtable.open_input_stream = _cb_open_input_stream + vtable.open_input_file = _cb_open_input_file + vtable.open_output_stream = _cb_open_output_stream + vtable.open_append_stream = _cb_open_append_stream + vtable.normalize_path = _cb_normalize_path + + wrapped = CPyFileSystem.Make(handler, move(vtable)) + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.pyfs = wrapped.get() + + @property + def handler(self): + """ + The filesystem's underlying handler. + + Returns + ------- + handler : FileSystemHandler + """ + return self.pyfs.handler() + + def __reduce__(self): + return PyFileSystem, (self.handler,) + + +class FileSystemHandler(ABC): + """ + An abstract class exposing methods to implement PyFileSystem's behavior. 
+ """ + + @abstractmethod + def get_type_name(self): + """ + Implement PyFileSystem.type_name. + """ + + @abstractmethod + def get_file_info(self, paths): + """ + Implement PyFileSystem.get_file_info(paths). + + Parameters + ---------- + paths : list of str + paths for which we want to retrieve the info. + """ + + @abstractmethod + def get_file_info_selector(self, selector): + """ + Implement PyFileSystem.get_file_info(selector). + + Parameters + ---------- + selector : FileSelector + selector for which we want to retrieve the info. + """ + + @abstractmethod + def create_dir(self, path, recursive): + """ + Implement PyFileSystem.create_dir(...). + + Parameters + ---------- + path : str + path of the directory. + recursive : bool + if the parent directories should be created too. + """ + + @abstractmethod + def delete_dir(self, path): + """ + Implement PyFileSystem.delete_dir(...). + + Parameters + ---------- + path : str + path of the directory. + """ + + @abstractmethod + def delete_dir_contents(self, path, missing_dir_ok=False): + """ + Implement PyFileSystem.delete_dir_contents(...). + + Parameters + ---------- + path : str + path of the directory. + missing_dir_ok : bool + if False an error should be raised if path does not exist + """ + + @abstractmethod + def delete_root_dir_contents(self): + """ + Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). + """ + + @abstractmethod + def delete_file(self, path): + """ + Implement PyFileSystem.delete_file(...). + + Parameters + ---------- + path : str + path of the file. + """ + + @abstractmethod + def move(self, src, dest): + """ + Implement PyFileSystem.move(...). + + Parameters + ---------- + src : str + path of what should be moved. + dest : str + path of where it should be moved to. + """ + + @abstractmethod + def copy_file(self, src, dest): + """ + Implement PyFileSystem.copy_file(...). + + Parameters + ---------- + src : str + path of what should be copied. + dest : str + path of where it should be copied to. + """ + + @abstractmethod + def open_input_stream(self, path): + """ + Implement PyFileSystem.open_input_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + + @abstractmethod + def open_input_file(self, path): + """ + Implement PyFileSystem.open_input_file(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + + @abstractmethod + def open_output_stream(self, path, metadata): + """ + Implement PyFileSystem.open_output_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + + @abstractmethod + def open_append_stream(self, path, metadata): + """ + Implement PyFileSystem.open_append_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + + @abstractmethod + def normalize_path(self, path): + """ + Implement PyFileSystem.normalize_path(...). + + Parameters + ---------- + path : str + path of what should be normalized. 
+ """ + +# Callback definitions for CPyFileSystemVtable + + +cdef void _cb_get_type_name(handler, c_string* out) except *: + out[0] = tobytes("py::" + handler.get_type_name()) + +cdef c_bool _cb_equals(handler, const CFileSystem& c_other) except False: + if c_other.type_name().startswith(b"py::"): + return ( c_other).handler() == handler + + return False + +cdef void _cb_get_file_info(handler, const c_string& path, + CFileInfo* out) except *: + infos = handler.get_file_info([frombytes(path)]) + if not isinstance(infos, list) or len(infos) != 1: + raise TypeError("get_file_info should have returned a 1-element list") + out[0] = FileInfo.unwrap_safe(infos[0]) + +cdef void _cb_get_file_info_vector(handler, const vector[c_string]& paths, + vector[CFileInfo]* out) except *: + py_paths = [frombytes(paths[i]) for i in range(len(paths))] + infos = handler.get_file_info(py_paths) + if not isinstance(infos, list): + raise TypeError("get_file_info should have returned a list") + out[0].clear() + out[0].reserve(len(infos)) + for info in infos: + out[0].push_back(FileInfo.unwrap_safe(info)) + +cdef void _cb_get_file_info_selector(handler, const CFileSelector& selector, + vector[CFileInfo]* out) except *: + infos = handler.get_file_info_selector(FileSelector.wrap(selector)) + if not isinstance(infos, list): + raise TypeError("get_file_info_selector should have returned a list") + out[0].clear() + out[0].reserve(len(infos)) + for info in infos: + out[0].push_back(FileInfo.unwrap_safe(info)) + +cdef void _cb_create_dir(handler, const c_string& path, + c_bool recursive) except *: + handler.create_dir(frombytes(path), recursive) + +cdef void _cb_delete_dir(handler, const c_string& path) except *: + handler.delete_dir(frombytes(path)) + +cdef void _cb_delete_dir_contents(handler, const c_string& path, + c_bool missing_dir_ok) except *: + handler.delete_dir_contents(frombytes(path), missing_dir_ok) + +cdef void _cb_delete_root_dir_contents(handler) except *: + handler.delete_root_dir_contents() + +cdef void _cb_delete_file(handler, const c_string& path) except *: + handler.delete_file(frombytes(path)) + +cdef void _cb_move(handler, const c_string& src, + const c_string& dest) except *: + handler.move(frombytes(src), frombytes(dest)) + +cdef void _cb_copy_file(handler, const c_string& src, + const c_string& dest) except *: + handler.copy_file(frombytes(src), frombytes(dest)) + +cdef void _cb_open_input_stream(handler, const c_string& path, + shared_ptr[CInputStream]* out) except *: + stream = handler.open_input_stream(frombytes(path)) + if not isinstance(stream, NativeFile): + raise TypeError("open_input_stream should have returned " + "a PyArrow file") + out[0] = ( stream).get_input_stream() + +cdef void _cb_open_input_file(handler, const c_string& path, + shared_ptr[CRandomAccessFile]* out) except *: + stream = handler.open_input_file(frombytes(path)) + if not isinstance(stream, NativeFile): + raise TypeError("open_input_file should have returned " + "a PyArrow file") + out[0] = ( stream).get_random_access_file() + +cdef void _cb_open_output_stream( + handler, const c_string& path, + const shared_ptr[const CKeyValueMetadata]& metadata, + shared_ptr[COutputStream]* out) except *: + stream = handler.open_output_stream( + frombytes(path), pyarrow_wrap_metadata(metadata)) + if not isinstance(stream, NativeFile): + raise TypeError("open_output_stream should have returned " + "a PyArrow file") + out[0] = ( stream).get_output_stream() + +cdef void _cb_open_append_stream( + handler, const c_string& path, + const 
shared_ptr[const CKeyValueMetadata]& metadata, + shared_ptr[COutputStream]* out) except *: + stream = handler.open_append_stream( + frombytes(path), pyarrow_wrap_metadata(metadata)) + if not isinstance(stream, NativeFile): + raise TypeError("open_append_stream should have returned " + "a PyArrow file") + out[0] = ( stream).get_output_stream() + +cdef void _cb_normalize_path(handler, const c_string& path, + c_string* out) except *: + out[0] = tobytes(handler.normalize_path(frombytes(path))) + + +def _copy_files(FileSystem source_fs, str source_path, + FileSystem destination_fs, str destination_path, + int64_t chunk_size, c_bool use_threads): + # low-level helper exposed through pyarrow/fs.py::copy_files + cdef: + CFileLocator c_source + vector[CFileLocator] c_sources + CFileLocator c_destination + vector[CFileLocator] c_destinations + + c_source.filesystem = source_fs.unwrap() + c_source.path = tobytes(source_path) + c_sources.push_back(c_source) + + c_destination.filesystem = destination_fs.unwrap() + c_destination.path = tobytes(destination_path) + c_destinations.push_back(c_destination) + + with nogil: + check_status(CCopyFiles( + c_sources, c_destinations, + c_default_io_context(), chunk_size, use_threads, + )) + + +def _copy_files_selector(FileSystem source_fs, FileSelector source_sel, + FileSystem destination_fs, str destination_base_dir, + int64_t chunk_size, c_bool use_threads): + # low-level helper exposed through pyarrow/fs.py::copy_files + cdef c_string c_destination_base_dir = tobytes(destination_base_dir) + + with nogil: + check_status(CCopyFilesWithSelector( + source_fs.unwrap(), source_sel.unwrap(), + destination_fs.unwrap(), c_destination_base_dir, + c_default_io_context(), chunk_size, use_threads, + )) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_gcsfs.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_gcsfs.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..66e07e1b02f6d4fce6afe43891f65bc87ea219f1 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_gcsfs.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_gcsfs.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_gcsfs.pyx new file mode 100644 index 0000000000000000000000000000000000000000..5e69413cea953639e36ba5485cb383b88193748b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_gcsfs.pyx @@ -0,0 +1,212 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
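The _copy_files and _copy_files_selector helpers defined just above are driven by the public pyarrow.fs.copy_files function. The following is a minimal sketch, not part of the vendored file; the /tmp paths are assumptions, and roughly speaking a plain file source goes through the file-locator helper while a directory source is expanded with a FileSelector.

    from pyarrow import fs

    local = fs.LocalFileSystem()

    # File-to-file copy between (possibly different) filesystems.
    fs.copy_files("/tmp/src.dat", "/tmp/dst.dat",
                  source_filesystem=local, destination_filesystem=local)

    # Recursive copy of a directory tree.
    fs.copy_files("/tmp/src_dir", "/tmp/dst_dir",
                  source_filesystem=local, destination_filesystem=local)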
+ +# cython: language_level = 3 + +from cython cimport binding + +from pyarrow.lib cimport (pyarrow_wrap_metadata, + pyarrow_unwrap_metadata) +from pyarrow.lib import frombytes, tobytes, ensure_metadata +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_fs cimport * +from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint + +from datetime import datetime, timedelta, timezone + + +cdef class GcsFileSystem(FileSystem): + """ + Google Cloud Storage (GCS) backed FileSystem implementation + + By default uses the process described in https://google.aip.dev/auth/4110 + to resolve credentials. If not running on Google Cloud Platform (GCP), + this generally requires the environment variable + GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file + containing credentials. + + Note: GCS buckets are special and the operations available on them may be + limited or more expensive than expected compared to local file systems. + + Note: When pickling a GcsFileSystem that uses default credentials, resolution + credentials are not stored in the serialized data. Therefore, when unpickling + it is assumed that the necessary credentials are in place for the target + process. + + Parameters + ---------- + anonymous : boolean, default False + Whether to connect anonymously. + If true, will not attempt to look up credentials using standard GCP + configuration methods. + access_token : str, default None + GCP access token. If provided, temporary credentials will be fetched by + assuming this role; also, a `credential_token_expiration` must be + specified as well. + target_service_account : str, default None + An optional service account to try to impersonate when accessing GCS. This + requires the specified credential user or service account to have the necessary + permissions. + credential_token_expiration : datetime, default None + Expiration for credential generated with an access token. Must be specified + if `access_token` is specified. + default_bucket_location : str, default 'US' + GCP region to create buckets in. + scheme : str, default 'https' + GCS connection transport scheme. + endpoint_override : str, default None + Override endpoint with a connect string such as "localhost:9000" + default_metadata : mapping or pyarrow.KeyValueMetadata, default None + Default metadata for `open_output_stream`. This will be ignored if + non-empty metadata is passed to `open_output_stream`. + retry_time_limit : timedelta, default None + Set the maximum amount of time the GCS client will attempt to retry + transient errors. Subsecond granularity is ignored. + project_id : str, default None + The GCP project identifier to use for creating buckets. + If not set, the library uses the GOOGLE_CLOUD_PROJECT environment + variable. Most I/O operations do not need a project id, only applications + that create new buckets need a project id. + """ + + cdef: + CGcsFileSystem* gcsfs + + def __init__(self, *, bint anonymous=False, access_token=None, + target_service_account=None, credential_token_expiration=None, + default_bucket_location='US', + scheme=None, + endpoint_override=None, + default_metadata=None, + retry_time_limit=None, + project_id=None): + cdef: + CGcsOptions options + shared_ptr[CGcsFileSystem] wrapped + double time_limit_seconds + + # Intentional use of truthiness because empty strings aren't valid and + # for reconstruction from pickling will give empty strings. 
+ if anonymous and (target_service_account or access_token): + raise ValueError( + 'anonymous option is not compatible with target_service_account and ' + 'access_token' + ) + elif bool(access_token) != bool(credential_token_expiration): + raise ValueError( + 'access_token and credential_token_expiration must be ' + 'specified together' + ) + + elif anonymous: + options = CGcsOptions.Anonymous() + elif access_token: + if not isinstance(credential_token_expiration, datetime): + raise ValueError( + "credential_token_expiration must be a datetime") + options = CGcsOptions.FromAccessToken( + tobytes(access_token), + PyDateTime_to_TimePoint(credential_token_expiration)) + else: + options = CGcsOptions.Defaults() + + # Target service account requires base credentials so + # it is not part of the if/else chain above which only + # handles base credentials. + if target_service_account: + options = CGcsOptions.FromImpersonatedServiceAccount( + options.credentials, tobytes(target_service_account)) + + options.default_bucket_location = tobytes(default_bucket_location) + + if scheme is not None: + options.scheme = tobytes(scheme) + if endpoint_override is not None: + options.endpoint_override = tobytes(endpoint_override) + if default_metadata is not None: + options.default_metadata = pyarrow_unwrap_metadata( + ensure_metadata(default_metadata)) + if retry_time_limit is not None: + time_limit_seconds = retry_time_limit.total_seconds() + options.retry_limit_seconds = time_limit_seconds + if project_id is not None: + options.project_id = tobytes(project_id) + + with nogil: + wrapped = GetResultValue(CGcsFileSystem.Make(options)) + + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.gcsfs = wrapped.get() + + def _expiration_datetime_from_options(self): + expiration_ns = TimePoint_to_ns( + self.gcsfs.options().credentials.expiration()) + if expiration_ns == 0: + return None + return datetime.fromtimestamp(expiration_ns / 1.0e9, timezone.utc) + + @staticmethod + @binding(True) # Required for cython < 3 + def _reconstruct(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + return GcsFileSystem(**kwargs) + + def __reduce__(self): + cdef CGcsOptions opts = self.gcsfs.options() + service_account = frombytes(opts.credentials.target_service_account()) + expiration_dt = self._expiration_datetime_from_options() + retry_time_limit = None + if opts.retry_limit_seconds.has_value(): + retry_time_limit = timedelta( + seconds=opts.retry_limit_seconds.value()) + project_id = None + if opts.project_id.has_value(): + project_id = frombytes(opts.project_id.value()) + return ( + GcsFileSystem._reconstruct, (dict( + access_token=frombytes(opts.credentials.access_token()), + anonymous=opts.credentials.anonymous(), + credential_token_expiration=expiration_dt, + target_service_account=service_account, + scheme=frombytes(opts.scheme), + endpoint_override=frombytes(opts.endpoint_override), + default_bucket_location=frombytes( + opts.default_bucket_location), + default_metadata=pyarrow_wrap_metadata(opts.default_metadata), + retry_time_limit=retry_time_limit, + project_id=project_id + ),)) + + @property + def default_bucket_location(self): + """ + The GCP location this filesystem will write to. + """ + return frombytes(self.gcsfs.options().default_bucket_location) + + @property + def project_id(self): + """ + The GCP project id this filesystem will use. 
+ """ + if self.gcsfs.options().project_id.has_value(): + return frombytes(self.gcsfs.options().project_id.value()) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_generated_version.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_generated_version.py new file mode 100644 index 0000000000000000000000000000000000000000..80fef85b3eac6ce28f9648c66c84aeffda9db86c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_generated_version.py @@ -0,0 +1,16 @@ +# file generated by setuptools_scm +# don't change, don't track in version control +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple, Union + VERSION_TUPLE = Tuple[Union[int, str], ...] +else: + VERSION_TUPLE = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE + +__version__ = version = '18.0.0' +__version_tuple__ = version_tuple = (18, 0, 0) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_hdfs.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_hdfs.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..69aa0b9cbac75ad4f58c85c2259adfe82a83661d Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_hdfs.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_hdfs.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_hdfs.pyx new file mode 100644 index 0000000000000000000000000000000000000000..c426337a12ec184feb2d699e1e685228c249466e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_hdfs.pyx @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from cython cimport binding + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_fs cimport * +from pyarrow._fs cimport FileSystem + +from pyarrow.lib import frombytes, tobytes +from pyarrow.util import _stringify_path + + +cdef class HadoopFileSystem(FileSystem): + """ + HDFS backed FileSystem implementation + + Parameters + ---------- + host : str + HDFS host to connect to. Set to "default" for fs.defaultFS from + core-site.xml. + port : int, default 8020 + HDFS port to connect to. Set to 0 for default or logical (HA) nodes. + user : str, default None + Username when connecting to HDFS; None implies login user. + replication : int, default 3 + Number of copies each block will have. + buffer_size : int, default 0 + If 0, no buffering will happen otherwise the size of the temporary read + and write buffer. + default_block_size : int, default None + None means the default configuration for HDFS, a typical block size is + 128 MB. 
+ kerb_ticket : string or path, default None + If not None, the path to the Kerberos ticket cache. + extra_conf : dict, default None + Extra key/value pairs for configuration; will override any + hdfs-site.xml properties. + + Examples + -------- + >>> from pyarrow import fs + >>> hdfs = fs.HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path) # doctest: +SKIP + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + + cdef: + CHadoopFileSystem* hdfs + + def __init__(self, str host, int port=8020, *, str user=None, + int replication=3, int buffer_size=0, + default_block_size=None, kerb_ticket=None, + extra_conf=None): + cdef: + CHdfsOptions options + shared_ptr[CHadoopFileSystem] wrapped + + if not host.startswith(('hdfs://', 'viewfs://')) and host != "default": + # TODO(kszucs): do more sanitization + host = 'hdfs://{}'.format(host) + + options.ConfigureEndPoint(tobytes(host), int(port)) + options.ConfigureReplication(replication) + options.ConfigureBufferSize(buffer_size) + + if user is not None: + options.ConfigureUser(tobytes(user)) + if default_block_size is not None: + options.ConfigureBlockSize(default_block_size) + if kerb_ticket is not None: + options.ConfigureKerberosTicketCachePath( + tobytes(_stringify_path(kerb_ticket))) + if extra_conf is not None: + for k, v in extra_conf.items(): + options.ConfigureExtraConf(tobytes(k), tobytes(v)) + + with nogil: + wrapped = GetResultValue(CHadoopFileSystem.Make(options)) + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.hdfs = wrapped.get() + + @staticmethod + def from_uri(uri): + """ + Instantiate HadoopFileSystem object from an URI string. + + The following two calls are equivalent + + * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ +&replication=1')`` + * ``HadoopFileSystem('localhost', port=8020, user='test', \ +replication=1)`` + + Parameters + ---------- + uri : str + A string URI describing the connection to HDFS. + In order to change the user, replication, buffer_size or + default_block_size pass the values as query parts. + + Returns + ------- + HadoopFileSystem + """ + cdef: + HadoopFileSystem self = HadoopFileSystem.__new__(HadoopFileSystem) + shared_ptr[CHadoopFileSystem] wrapped + CHdfsOptions options + + options = GetResultValue(CHdfsOptions.FromUriString(tobytes(uri))) + with nogil: + wrapped = GetResultValue(CHadoopFileSystem.Make(options)) + + self.init( wrapped) + return self + + @staticmethod + @binding(True) # Required for cython < 3 + def _reconstruct(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. 
+ return HadoopFileSystem(**kwargs) + + def __reduce__(self): + cdef CHdfsOptions opts = self.hdfs.options() + return ( + HadoopFileSystem._reconstruct, (dict( + host=frombytes(opts.connection_config.host), + port=opts.connection_config.port, + user=frombytes(opts.connection_config.user), + replication=opts.replication, + buffer_size=opts.buffer_size, + default_block_size=opts.default_block_size, + kerb_ticket=frombytes(opts.connection_config.kerb_ticket), + extra_conf={frombytes(k): frombytes(v) + for k, v in opts.connection_config.extra_conf}, + ),) + ) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..b1bd1c9c591c05275fc4caab85cc20360295be85 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.pxd new file mode 100644 index 0000000000000000000000000000000000000000..42a0a678a9b6a543c657c905f3eb4fa4490b6edf --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.pxd @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from pyarrow.includes.libarrow cimport * +from pyarrow.lib cimport _Weakrefable + + +cdef class ParseOptions(_Weakrefable): + cdef: + CJSONParseOptions options + + @staticmethod + cdef ParseOptions wrap(CJSONParseOptions options) + +cdef class ReadOptions(_Weakrefable): + cdef: + CJSONReadOptions options + + @staticmethod + cdef ReadOptions wrap(CJSONReadOptions options) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.pyx new file mode 100644 index 0000000000000000000000000000000000000000..d36dad67abbaa575d8963273c884dd9e8f047b13 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_json.pyx @@ -0,0 +1,310 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: language_level = 3 + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.lib cimport (_Weakrefable, MemoryPool, + maybe_unbox_memory_pool, + get_input_stream, pyarrow_wrap_table, + pyarrow_wrap_schema, pyarrow_unwrap_schema) + + +cdef class ReadOptions(_Weakrefable): + """ + Options for reading JSON files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual chunks in the Table. + """ + + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, use_threads=None, block_size=None): + self.options = CJSONReadOptions.Defaults() + if use_threads is not None: + self.use_threads = use_threads + if block_size is not None: + self.block_size = block_size + + @property + def use_threads(self): + """ + Whether to use multiple threads to accelerate reading. + """ + return self.options.use_threads + + @use_threads.setter + def use_threads(self, value): + self.options.use_threads = value + + @property + def block_size(self): + """ + How much bytes to process at a time from the input stream. + + This will determine multi-threading granularity as well as the size of + individual chunks in the Table. + """ + return self.options.block_size + + @block_size.setter + def block_size(self, value): + self.options.block_size = value + + def __reduce__(self): + return ReadOptions, ( + self.use_threads, + self.block_size + ) + + def equals(self, ReadOptions other): + """ + Parameters + ---------- + other : pyarrow.json.ReadOptions + + Returns + ------- + bool + """ + return ( + self.use_threads == other.use_threads and + self.block_size == other.block_size + ) + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False + + @staticmethod + cdef ReadOptions wrap(CJSONReadOptions options): + out = ReadOptions() + out.options = options # shallow copy + return out + + +cdef class ParseOptions(_Weakrefable): + """ + Options for parsing JSON files. + + Parameters + ---------- + explicit_schema : Schema, optional (default None) + Optional explicit schema (no type inference, ignores other fields). + newlines_in_values : bool, optional (default False) + Whether objects may be printed across multiple lines (for example + pretty printed). If false, input must end with an empty line. + unexpected_field_behavior : str, default "infer" + How JSON fields outside of explicit_schema (if given) are treated. 
+ + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + """ + + __slots__ = () + + def __init__(self, explicit_schema=None, newlines_in_values=None, + unexpected_field_behavior=None): + self.options = CJSONParseOptions.Defaults() + if explicit_schema is not None: + self.explicit_schema = explicit_schema + if newlines_in_values is not None: + self.newlines_in_values = newlines_in_values + if unexpected_field_behavior is not None: + self.unexpected_field_behavior = unexpected_field_behavior + + def __reduce__(self): + return ParseOptions, ( + self.explicit_schema, + self.newlines_in_values, + self.unexpected_field_behavior + ) + + @property + def explicit_schema(self): + """ + Optional explicit schema (no type inference, ignores other fields) + """ + if self.options.explicit_schema.get() == NULL: + return None + else: + return pyarrow_wrap_schema(self.options.explicit_schema) + + @explicit_schema.setter + def explicit_schema(self, value): + self.options.explicit_schema = pyarrow_unwrap_schema(value) + + @property + def newlines_in_values(self): + """ + Whether newline characters are allowed in JSON values. + Setting this to True reduces the performance of multi-threaded + JSON reading. + """ + return self.options.newlines_in_values + + @newlines_in_values.setter + def newlines_in_values(self, value): + self.options.newlines_in_values = value + + @property + def unexpected_field_behavior(self): + """ + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + + Set to "infer" by default. 
+ """ + v = self.options.unexpected_field_behavior + if v == CUnexpectedFieldBehavior_Ignore: + return "ignore" + elif v == CUnexpectedFieldBehavior_Error: + return "error" + elif v == CUnexpectedFieldBehavior_InferType: + return "infer" + else: + raise ValueError('Unexpected value for unexpected_field_behavior') + + @unexpected_field_behavior.setter + def unexpected_field_behavior(self, value): + cdef CUnexpectedFieldBehavior v + + if value == "ignore": + v = CUnexpectedFieldBehavior_Ignore + elif value == "error": + v = CUnexpectedFieldBehavior_Error + elif value == "infer": + v = CUnexpectedFieldBehavior_InferType + else: + raise ValueError( + "Unexpected value `{}` for `unexpected_field_behavior`, pass " + "either `ignore`, `error` or `infer`.".format(value) + ) + + self.options.unexpected_field_behavior = v + + def equals(self, ParseOptions other): + """ + Parameters + ---------- + other : pyarrow.json.ParseOptions + + Returns + ------- + bool + """ + return ( + self.explicit_schema == other.explicit_schema and + self.newlines_in_values == other.newlines_in_values and + self.unexpected_field_behavior == other.unexpected_field_behavior + ) + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False + + @staticmethod + cdef ParseOptions wrap(CJSONParseOptions options): + out = ParseOptions() + out.options = options # shallow copy + return out + + +cdef _get_reader(input_file, shared_ptr[CInputStream]* out): + use_memory_map = False + get_input_stream(input_file, use_memory_map, out) + +cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out): + if read_options is None: + out[0] = CJSONReadOptions.Defaults() + else: + out[0] = read_options.options + +cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out): + if parse_options is None: + out[0] = CJSONParseOptions.Defaults() + else: + out[0] = parse_options.options + + +def read_json(input_file, read_options=None, parse_options=None, + MemoryPool memory_pool=None): + """ + Read a Table from a stream of JSON data. + + Parameters + ---------- + input_file : str, path or file-like object + The location of JSON data. Currently only the line-delimited JSON + format is supported. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see ReadOptions constructor for defaults). + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see ParseOptions constructor for defaults). + memory_pool : MemoryPool, optional + Pool to allocate Table memory from. + + Returns + ------- + :class:`pyarrow.Table` + Contents of the JSON file as a in-memory table. 
+ """ + cdef: + shared_ptr[CInputStream] stream + CJSONReadOptions c_read_options + CJSONParseOptions c_parse_options + shared_ptr[CJSONReader] reader + shared_ptr[CTable] table + + _get_reader(input_file, &stream) + _get_read_options(read_options, &c_read_options) + _get_parse_options(parse_options, &c_parse_options) + + reader = GetResultValue( + CJSONReader.Make(maybe_unbox_memory_pool(memory_pool), + stream, c_read_options, c_parse_options)) + + with nogil: + table = GetResultValue(reader.get().Read()) + + return pyarrow_wrap_table(table) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..b20d5b2936b1229e1adbd85b22fb4bbc7f9d0ca6 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.pxd new file mode 100644 index 0000000000000000000000000000000000000000..aecbba317aecd1b331261ca600058e30e0c4f184 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.pxd @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# distutils: language = c++ +# cython: language_level = 3 + +from libcpp cimport bool as c_bool +from libc.string cimport const_char +from libcpp.vector cimport vector as std_vector +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus, + CResult, CTable, CMemoryPool, + CKeyValueMetadata, + CRecordBatch, + CTable, CCompressionType, + CRandomAccessFile, COutputStream, + TimeUnit) + +cdef extern from "arrow/adapters/orc/options.h" \ + namespace "arrow::adapters::orc" nogil: + cdef enum CompressionStrategy \ + " arrow::adapters::orc::CompressionStrategy": + _CompressionStrategy_SPEED \ + " arrow::adapters::orc::CompressionStrategy::kSpeed" + _CompressionStrategy_COMPRESSION \ + " arrow::adapters::orc::CompressionStrategy::kCompression" + + cdef enum WriterId" arrow::adapters::orc::WriterId": + _WriterId_ORC_JAVA_WRITER" arrow::adapters::orc::WriterId::kOrcJava" + _WriterId_ORC_CPP_WRITER" arrow::adapters::orc::WriterId::kOrcCpp" + _WriterId_PRESTO_WRITER" arrow::adapters::orc::WriterId::kPresto" + _WriterId_SCRITCHLEY_GO \ + " arrow::adapters::orc::WriterId::kScritchleyGo" + _WriterId_TRINO_WRITER" arrow::adapters::orc::WriterId::kTrino" + _WriterId_UNKNOWN_WRITER" arrow::adapters::orc::WriterId::kUnknown" + + cdef enum WriterVersion" arrow::adapters::orc::WriterVersion": + _WriterVersion_ORIGINAL \ + " arrow::adapters::orc::WriterVersion::kOriginal" + _WriterVersion_HIVE_8732 \ + " arrow::adapters::orc::WriterVersion::kHive8732" + _WriterVersion_HIVE_4243 \ + " arrow::adapters::orc::WriterVersion::kHive4243" + _WriterVersion_HIVE_12055 \ + " arrow::adapters::orc::WriterVersion::kHive12055" + _WriterVersion_HIVE_13083 \ + " arrow::adapters::orc::WriterVersion::kHive13083" + _WriterVersion_ORC_101" arrow::adapters::orc::WriterVersion::kOrc101" + _WriterVersion_ORC_135" arrow::adapters::orc::WriterVersion::kOrc135" + _WriterVersion_ORC_517" arrow::adapters::orc::WriterVersion::kOrc517" + _WriterVersion_ORC_203" arrow::adapters::orc::WriterVersion::kOrc203" + _WriterVersion_ORC_14" arrow::adapters::orc::WriterVersion::kOrc14" + _WriterVersion_MAX" arrow::adapters::orc::WriterVersion::kMax" + + cdef cppclass FileVersion" arrow::adapters::orc::FileVersion": + FileVersion(uint32_t major_version, uint32_t minor_version) + uint32_t major_version() + uint32_t minor_version() + c_string ToString() + + cdef struct WriteOptions" arrow::adapters::orc::WriteOptions": + int64_t batch_size + FileVersion file_version + int64_t stripe_size + CCompressionType compression + int64_t compression_block_size + CompressionStrategy compression_strategy + int64_t row_index_stride + double padding_tolerance + double dictionary_key_size_threshold + std_vector[int64_t] bloom_filter_columns + double bloom_filter_fpp + + +cdef extern from "arrow/adapters/orc/adapter.h" \ + namespace "arrow::adapters::orc" nogil: + + cdef cppclass ORCFileReader: + @staticmethod + CResult[unique_ptr[ORCFileReader]] Open( + const shared_ptr[CRandomAccessFile]& file, + CMemoryPool* pool) + + CResult[shared_ptr[const CKeyValueMetadata]] ReadMetadata() + + CResult[shared_ptr[CSchema]] ReadSchema() + + CResult[shared_ptr[CRecordBatch]] ReadStripe(int64_t stripe) + CResult[shared_ptr[CRecordBatch]] ReadStripe( + int64_t stripe, std_vector[c_string]) + + CResult[shared_ptr[CTable]] Read() + CResult[shared_ptr[CTable]] Read(std_vector[c_string]) + + int64_t NumberOfStripes() + int64_t NumberOfRows() + FileVersion GetFileVersion() + c_string GetSoftwareVersion() + CResult[CCompressionType] 
GetCompression() + int64_t GetCompressionSize() + int64_t GetRowIndexStride() + WriterId GetWriterId() + int32_t GetWriterIdValue() + WriterVersion GetWriterVersion() + int64_t GetNumberOfStripeStatistics() + int64_t GetContentLength() + int64_t GetStripeStatisticsLength() + int64_t GetFileFooterLength() + int64_t GetFilePostscriptLength() + int64_t GetFileLength() + c_string GetSerializedFileTail() + + cdef cppclass ORCFileWriter: + @staticmethod + CResult[unique_ptr[ORCFileWriter]] Open( + COutputStream* output_stream, const WriteOptions& writer_options) + + CStatus Write(const CTable& table) + + CStatus Close() diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.pyx new file mode 100644 index 0000000000000000000000000000000000000000..1dd6848122c2d4d5d2a40faf70bbb4647329f9d8 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_orc.pyx @@ -0,0 +1,445 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ + +from cython.operator cimport dereference as deref +from libcpp.vector cimport vector as std_vector +from libcpp.utility cimport move +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.lib cimport (check_status, _Weakrefable, + MemoryPool, maybe_unbox_memory_pool, + pyarrow_wrap_schema, + pyarrow_wrap_batch, + Table, + pyarrow_wrap_table, + pyarrow_wrap_metadata, + pyarrow_unwrap_table, + get_reader, + get_writer) +from pyarrow.lib import frombytes, tobytes +from pyarrow.util import _stringify_path + + +cdef compression_type_from_enum(CCompressionType compression_type): + compression_map = { + CCompressionType_UNCOMPRESSED: 'UNCOMPRESSED', + CCompressionType_GZIP: 'ZLIB', + CCompressionType_SNAPPY: 'SNAPPY', + CCompressionType_LZ4: 'LZ4', + CCompressionType_ZSTD: 'ZSTD', + } + if compression_type in compression_map: + return compression_map[compression_type] + raise ValueError('Unsupported compression') + + +cdef CCompressionType compression_type_from_name(name) except *: + if not isinstance(name, str): + raise TypeError('compression must be a string') + name = name.upper() + if name == 'ZLIB': + return CCompressionType_GZIP + elif name == 'SNAPPY': + return CCompressionType_SNAPPY + elif name == 'LZ4': + return CCompressionType_LZ4 + elif name == 'ZSTD': + return CCompressionType_ZSTD + elif name == 'UNCOMPRESSED': + return CCompressionType_UNCOMPRESSED + raise ValueError(f'Unknown CompressionKind: {name}') + + +cdef compression_strategy_from_enum( + CompressionStrategy compression_strategy +): + compression_strategy_map = { + _CompressionStrategy_SPEED: 'SPEED', + _CompressionStrategy_COMPRESSION: 'COMPRESSION', + } + if compression_strategy in 
compression_strategy_map: + return compression_strategy_map[compression_strategy] + raise ValueError('Unsupported compression strategy') + + +cdef CompressionStrategy compression_strategy_from_name(name) except *: + if not isinstance(name, str): + raise TypeError('compression strategy must be a string') + name = name.upper() + if name == 'COMPRESSION': + return _CompressionStrategy_COMPRESSION + elif name == 'SPEED': + return _CompressionStrategy_SPEED + raise ValueError(f'Unknown CompressionStrategy: {name}') + + +cdef file_version_from_class(FileVersion file_version): + return frombytes(file_version.ToString()) + + +cdef writer_id_from_enum(WriterId writer_id): + writer_id_map = { + _WriterId_ORC_JAVA_WRITER: 'ORC_JAVA', + _WriterId_ORC_CPP_WRITER: 'ORC_CPP', + _WriterId_PRESTO_WRITER: 'PRESTO', + _WriterId_SCRITCHLEY_GO: 'SCRITCHLEY_GO', + _WriterId_TRINO_WRITER: 'TRINO', + } + if writer_id in writer_id_map: + return writer_id_map[writer_id] + raise ValueError('Unsupported writer ID') + + +cdef writer_version_from_enum(WriterVersion writer_version): + writer_version_map = { + _WriterVersion_ORIGINAL: 'ORIGINAL', + _WriterVersion_HIVE_8732: 'HIVE_8732', + _WriterVersion_HIVE_4243: 'HIVE_4243', + _WriterVersion_HIVE_12055: 'HIVE_12055', + _WriterVersion_HIVE_13083: 'HIVE_13083', + _WriterVersion_ORC_101: 'ORC_101', + _WriterVersion_ORC_135: 'ORC_135', + _WriterVersion_ORC_517: 'ORC_517', + _WriterVersion_ORC_203: 'ORC_203', + _WriterVersion_ORC_14: 'ORC_14', + } + if writer_version in writer_version_map: + return writer_version_map[writer_version] + raise ValueError('Unsupported writer version') + + +cdef shared_ptr[WriteOptions] _create_write_options( + file_version=None, + batch_size=None, + stripe_size=None, + compression=None, + compression_block_size=None, + compression_strategy=None, + row_index_stride=None, + padding_tolerance=None, + dictionary_key_size_threshold=None, + bloom_filter_columns=None, + bloom_filter_fpp=None +) except *: + """General writer options""" + cdef: + shared_ptr[WriteOptions] options + options = make_shared[WriteOptions]() + # batch_size + if batch_size is not None: + if isinstance(batch_size, int) and batch_size > 0: + deref(options).batch_size = batch_size + else: + raise ValueError(f"Invalid ORC writer batch size: {batch_size}") + # file_version + if file_version is not None: + if file_version == "0.12": + deref(options).file_version = FileVersion(0, 12) + elif file_version == "0.11": + deref(options).file_version = FileVersion(0, 11) + else: + raise ValueError(f"Unsupported ORC file version: {file_version}") + # stripe_size + if stripe_size is not None: + if isinstance(stripe_size, int) and stripe_size > 0: + deref(options).stripe_size = stripe_size + else: + raise ValueError(f"Invalid ORC stripe size: {stripe_size}") + # compression + if compression is not None: + if isinstance(compression, str): + deref(options).compression = compression_type_from_name( + compression) + else: + raise TypeError("Unsupported ORC compression type: " + f"{compression}") + # compression_block_size + if compression_block_size is not None: + if (isinstance(compression_block_size, int) and + compression_block_size > 0): + deref(options).compression_block_size = compression_block_size + else: + raise ValueError("Invalid ORC compression block size: " + f"{compression_block_size}") + # compression_strategy + if compression_strategy is not None: + if isinstance(compression, str): + deref(options).compression_strategy = \ + compression_strategy_from_name(compression_strategy) + 
else: + raise TypeError("Unsupported ORC compression strategy: " + f"{compression_strategy}") + # row_index_stride + if row_index_stride is not None: + if isinstance(row_index_stride, int) and row_index_stride > 0: + deref(options).row_index_stride = row_index_stride + else: + raise ValueError("Invalid ORC row index stride: " + f"{row_index_stride}") + # padding_tolerance + if padding_tolerance is not None: + try: + padding_tolerance = float(padding_tolerance) + deref(options).padding_tolerance = padding_tolerance + except Exception: + raise ValueError("Invalid ORC padding tolerance: " + f"{padding_tolerance}") + # dictionary_key_size_threshold + if dictionary_key_size_threshold is not None: + try: + dictionary_key_size_threshold = float( + dictionary_key_size_threshold) + assert 0 <= dictionary_key_size_threshold <= 1 + deref(options).dictionary_key_size_threshold = \ + dictionary_key_size_threshold + except Exception: + raise ValueError("Invalid ORC dictionary key size threshold: " + f"{dictionary_key_size_threshold}") + # bloom_filter_columns + if bloom_filter_columns is not None: + try: + bloom_filter_columns = list(bloom_filter_columns) + for col in bloom_filter_columns: + assert isinstance(col, int) and col >= 0 + deref(options).bloom_filter_columns = bloom_filter_columns + except Exception: + raise ValueError("Invalid ORC BloomFilter columns: " + f"{bloom_filter_columns}") + # Max false positive rate of the Bloom Filter + if bloom_filter_fpp is not None: + try: + bloom_filter_fpp = float(bloom_filter_fpp) + assert 0 <= bloom_filter_fpp <= 1 + deref(options).bloom_filter_fpp = bloom_filter_fpp + except Exception: + raise ValueError("Invalid ORC BloomFilter false positive rate: " + f"{bloom_filter_fpp}") + return options + + +cdef class ORCReader(_Weakrefable): + cdef: + object source + CMemoryPool* allocator + unique_ptr[ORCFileReader] reader + + def __cinit__(self, MemoryPool memory_pool=None): + self.allocator = maybe_unbox_memory_pool(memory_pool) + + def open(self, object source, c_bool use_memory_map=True): + cdef: + shared_ptr[CRandomAccessFile] rd_handle + + self.source = source + + get_reader(source, use_memory_map, &rd_handle) + with nogil: + self.reader = move(GetResultValue( + ORCFileReader.Open(rd_handle, self.allocator) + )) + + def metadata(self): + """ + The arrow metadata for this file. + + Returns + ------- + metadata : pyarrow.KeyValueMetadata + """ + cdef: + shared_ptr[const CKeyValueMetadata] sp_arrow_metadata + + with nogil: + sp_arrow_metadata = GetResultValue( + deref(self.reader).ReadMetadata() + ) + + return pyarrow_wrap_metadata(sp_arrow_metadata) + + def schema(self): + """ + The arrow schema for this file. 
+ + Returns + ------- + schema : pyarrow.Schema + """ + cdef: + shared_ptr[CSchema] sp_arrow_schema + + with nogil: + sp_arrow_schema = GetResultValue(deref(self.reader).ReadSchema()) + + return pyarrow_wrap_schema(sp_arrow_schema) + + def nrows(self): + return deref(self.reader).NumberOfRows() + + def nstripes(self): + return deref(self.reader).NumberOfStripes() + + def file_version(self): + return file_version_from_class(deref(self.reader).GetFileVersion()) + + def software_version(self): + return frombytes(deref(self.reader).GetSoftwareVersion()) + + def compression(self): + return compression_type_from_enum( + GetResultValue(deref(self.reader).GetCompression())) + + def compression_size(self): + return deref(self.reader).GetCompressionSize() + + def row_index_stride(self): + return deref(self.reader).GetRowIndexStride() + + def writer(self): + writer_name = writer_id_from_enum(deref(self.reader).GetWriterId()) + if writer_name == 'UNKNOWN': + return deref(self.reader).GetWriterIdValue() + else: + return writer_name + + def writer_version(self): + return writer_version_from_enum(deref(self.reader).GetWriterVersion()) + + def nstripe_statistics(self): + return deref(self.reader).GetNumberOfStripeStatistics() + + def content_length(self): + return deref(self.reader).GetContentLength() + + def stripe_statistics_length(self): + return deref(self.reader).GetStripeStatisticsLength() + + def file_footer_length(self): + return deref(self.reader).GetFileFooterLength() + + def file_postscript_length(self): + return deref(self.reader).GetFilePostscriptLength() + + def file_length(self): + return deref(self.reader).GetFileLength() + + def serialized_file_tail(self): + return deref(self.reader).GetSerializedFileTail() + + def read_stripe(self, n, columns=None): + cdef: + shared_ptr[CRecordBatch] sp_record_batch + int64_t stripe + std_vector[c_string] c_names + + stripe = n + + if columns is None: + with nogil: + sp_record_batch = GetResultValue( + deref(self.reader).ReadStripe(stripe) + ) + else: + c_names = [tobytes(name) for name in columns] + with nogil: + sp_record_batch = GetResultValue( + deref(self.reader).ReadStripe(stripe, c_names) + ) + + return pyarrow_wrap_batch(sp_record_batch) + + def read(self, columns=None): + cdef: + shared_ptr[CTable] sp_table + std_vector[c_string] c_names + + if columns is None: + with nogil: + sp_table = GetResultValue(deref(self.reader).Read()) + else: + c_names = [tobytes(name) for name in columns] + with nogil: + sp_table = GetResultValue(deref(self.reader).Read(c_names)) + + return pyarrow_wrap_table(sp_table) + + +cdef class ORCWriter(_Weakrefable): + cdef: + unique_ptr[ORCFileWriter] writer + shared_ptr[COutputStream] sink + c_bool own_sink + + def open(self, object where, *, + file_version=None, + batch_size=None, + stripe_size=None, + compression=None, + compression_block_size=None, + compression_strategy=None, + row_index_stride=None, + padding_tolerance=None, + dictionary_key_size_threshold=None, + bloom_filter_columns=None, + bloom_filter_fpp=None): + cdef: + shared_ptr[WriteOptions] write_options + c_string c_where + try: + where = _stringify_path(where) + except TypeError: + get_writer(where, &self.sink) + self.own_sink = False + else: + c_where = tobytes(where) + with nogil: + self.sink = GetResultValue(FileOutputStream.Open(c_where)) + self.own_sink = True + + write_options = _create_write_options( + file_version=file_version, + batch_size=batch_size, + stripe_size=stripe_size, + compression=compression, + 
compression_block_size=compression_block_size, + compression_strategy=compression_strategy, + row_index_stride=row_index_stride, + padding_tolerance=padding_tolerance, + dictionary_key_size_threshold=dictionary_key_size_threshold, + bloom_filter_columns=bloom_filter_columns, + bloom_filter_fpp=bloom_filter_fpp + ) + + with nogil: + self.writer = move(GetResultValue( + ORCFileWriter.Open(self.sink.get(), + deref(write_options)))) + + def write(self, Table table): + cdef: + shared_ptr[CTable] sp_table + sp_table = pyarrow_unwrap_table(table) + with nogil: + check_status(deref(self.writer).Write(deref(sp_table))) + + def close(self): + with nogil: + check_status(deref(self.writer).Close()) + if self.own_sink: + check_status(deref(self.sink).Close()) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..5fbd620d03cc92343d8aceed67a05cde8d7a2481 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.pxd new file mode 100644 index 0000000000000000000000000000000000000000..d6aebd8284f4a2a0a54d7bcbc9cdccbb03c7ef83 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.pxd @@ -0,0 +1,680 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
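A minimal sketch of the high-level pyarrow.orc API that the ORCReader / ORCWriter classes above implement; the file name and column are hypothetical:

import pyarrow as pa
from pyarrow import orc

table = pa.table({"x": [1, 2, 3]})
# 'zstd' maps to CCompressionType_ZSTD via compression_type_from_name above.
orc.write_table(table, "data.orc", compression="zstd")

f = orc.ORCFile("data.orc")
print(f.nstripes, f.schema)
restored = f.read(columns=["x"])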
+ +# distutils: language = c++ +# cython: language_level = 3 + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport (CChunkedArray, CScalar, CSchema, CStatus, + CTable, CMemoryPool, CBuffer, + CKeyValueMetadata, CRandomAccessFile, + COutputStream, CCacheOptions, + TimeUnit, CRecordBatchReader) +from pyarrow.lib cimport _Weakrefable + + +cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: + cdef cppclass Node: + pass + + cdef cppclass GroupNode(Node): + pass + + cdef cppclass PrimitiveNode(Node): + pass + + cdef cppclass ColumnPath: + c_string ToDotString() + vector[c_string] ToDotVector() + + +cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: + enum ParquetType" parquet::Type::type": + ParquetType_BOOLEAN" parquet::Type::BOOLEAN" + ParquetType_INT32" parquet::Type::INT32" + ParquetType_INT64" parquet::Type::INT64" + ParquetType_INT96" parquet::Type::INT96" + ParquetType_FLOAT" parquet::Type::FLOAT" + ParquetType_DOUBLE" parquet::Type::DOUBLE" + ParquetType_BYTE_ARRAY" parquet::Type::BYTE_ARRAY" + ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY" + + enum ParquetLogicalTypeId" parquet::LogicalType::Type::type": + ParquetLogicalType_UNDEFINED" parquet::LogicalType::Type::UNDEFINED" + ParquetLogicalType_STRING" parquet::LogicalType::Type::STRING" + ParquetLogicalType_MAP" parquet::LogicalType::Type::MAP" + ParquetLogicalType_LIST" parquet::LogicalType::Type::LIST" + ParquetLogicalType_ENUM" parquet::LogicalType::Type::ENUM" + ParquetLogicalType_DECIMAL" parquet::LogicalType::Type::DECIMAL" + ParquetLogicalType_DATE" parquet::LogicalType::Type::DATE" + ParquetLogicalType_TIME" parquet::LogicalType::Type::TIME" + ParquetLogicalType_TIMESTAMP" parquet::LogicalType::Type::TIMESTAMP" + ParquetLogicalType_INT" parquet::LogicalType::Type::INT" + ParquetLogicalType_FLOAT16" parquet::LogicalType::Type::FLOAT16" + ParquetLogicalType_JSON" parquet::LogicalType::Type::JSON" + ParquetLogicalType_BSON" parquet::LogicalType::Type::BSON" + ParquetLogicalType_UUID" parquet::LogicalType::Type::UUID" + ParquetLogicalType_NONE" parquet::LogicalType::Type::NONE" + + enum ParquetTimeUnit" parquet::LogicalType::TimeUnit::unit": + ParquetTimeUnit_UNKNOWN" parquet::LogicalType::TimeUnit::UNKNOWN" + ParquetTimeUnit_MILLIS" parquet::LogicalType::TimeUnit::MILLIS" + ParquetTimeUnit_MICROS" parquet::LogicalType::TimeUnit::MICROS" + ParquetTimeUnit_NANOS" parquet::LogicalType::TimeUnit::NANOS" + + enum ParquetConvertedType" parquet::ConvertedType::type": + ParquetConvertedType_NONE" parquet::ConvertedType::NONE" + ParquetConvertedType_UTF8" parquet::ConvertedType::UTF8" + ParquetConvertedType_MAP" parquet::ConvertedType::MAP" + ParquetConvertedType_MAP_KEY_VALUE \ + " parquet::ConvertedType::MAP_KEY_VALUE" + ParquetConvertedType_LIST" parquet::ConvertedType::LIST" + ParquetConvertedType_ENUM" parquet::ConvertedType::ENUM" + ParquetConvertedType_DECIMAL" parquet::ConvertedType::DECIMAL" + ParquetConvertedType_DATE" parquet::ConvertedType::DATE" + ParquetConvertedType_TIME_MILLIS" parquet::ConvertedType::TIME_MILLIS" + ParquetConvertedType_TIME_MICROS" parquet::ConvertedType::TIME_MICROS" + ParquetConvertedType_TIMESTAMP_MILLIS \ + " parquet::ConvertedType::TIMESTAMP_MILLIS" + ParquetConvertedType_TIMESTAMP_MICROS \ + " parquet::ConvertedType::TIMESTAMP_MICROS" + ParquetConvertedType_UINT_8" parquet::ConvertedType::UINT_8" + ParquetConvertedType_UINT_16" parquet::ConvertedType::UINT_16" + ParquetConvertedType_UINT_32" 
parquet::ConvertedType::UINT_32" + ParquetConvertedType_UINT_64" parquet::ConvertedType::UINT_64" + ParquetConvertedType_INT_8" parquet::ConvertedType::INT_8" + ParquetConvertedType_INT_16" parquet::ConvertedType::INT_16" + ParquetConvertedType_INT_32" parquet::ConvertedType::INT_32" + ParquetConvertedType_INT_64" parquet::ConvertedType::INT_64" + ParquetConvertedType_JSON" parquet::ConvertedType::JSON" + ParquetConvertedType_BSON" parquet::ConvertedType::BSON" + ParquetConvertedType_INTERVAL" parquet::ConvertedType::INTERVAL" + + enum ParquetRepetition" parquet::Repetition::type": + ParquetRepetition_REQUIRED" parquet::REPETITION::REQUIRED" + ParquetRepetition_OPTIONAL" parquet::REPETITION::OPTIONAL" + ParquetRepetition_REPEATED" parquet::REPETITION::REPEATED" + + enum ParquetEncoding" parquet::Encoding::type": + ParquetEncoding_PLAIN" parquet::Encoding::PLAIN" + ParquetEncoding_PLAIN_DICTIONARY" parquet::Encoding::PLAIN_DICTIONARY" + ParquetEncoding_RLE" parquet::Encoding::RLE" + ParquetEncoding_BIT_PACKED" parquet::Encoding::BIT_PACKED" + ParquetEncoding_DELTA_BINARY_PACKED \ + " parquet::Encoding::DELTA_BINARY_PACKED" + ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY \ + " parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY" + ParquetEncoding_DELTA_BYTE_ARRAY" parquet::Encoding::DELTA_BYTE_ARRAY" + ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY" + ParquetEncoding_BYTE_STREAM_SPLIT \ + " parquet::Encoding::BYTE_STREAM_SPLIT" + + enum ParquetCompression" parquet::Compression::type": + ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED" + ParquetCompression_SNAPPY" parquet::Compression::SNAPPY" + ParquetCompression_GZIP" parquet::Compression::GZIP" + ParquetCompression_LZO" parquet::Compression::LZO" + ParquetCompression_BROTLI" parquet::Compression::BROTLI" + ParquetCompression_LZ4" parquet::Compression::LZ4" + ParquetCompression_ZSTD" parquet::Compression::ZSTD" + + enum ParquetVersion" parquet::ParquetVersion::type": + ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0" + ParquetVersion_V2_0" parquet::ParquetVersion::PARQUET_2_0" + ParquetVersion_V2_4" parquet::ParquetVersion::PARQUET_2_4" + ParquetVersion_V2_6" parquet::ParquetVersion::PARQUET_2_6" + + enum ParquetSortOrder" parquet::SortOrder::type": + ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED" + ParquetSortOrder_UNSIGNED" parquet::SortOrder::UNSIGNED" + ParquetSortOrder_UNKNOWN" parquet::SortOrder::UNKNOWN" + + cdef cppclass CParquetLogicalType" parquet::LogicalType": + c_string ToString() const + c_string ToJSON() const + ParquetLogicalTypeId type() const + + cdef cppclass CParquetDecimalType \ + " parquet::DecimalLogicalType"(CParquetLogicalType): + int32_t precision() const + int32_t scale() const + + cdef cppclass CParquetIntType \ + " parquet::IntLogicalType"(CParquetLogicalType): + int bit_width() const + c_bool is_signed() const + + cdef cppclass CParquetTimeType \ + " parquet::TimeLogicalType"(CParquetLogicalType): + c_bool is_adjusted_to_utc() const + ParquetTimeUnit time_unit() const + + cdef cppclass CParquetTimestampType \ + " parquet::TimestampLogicalType"(CParquetLogicalType): + c_bool is_adjusted_to_utc() const + ParquetTimeUnit time_unit() const + + cdef cppclass ColumnDescriptor" parquet::ColumnDescriptor": + c_bool Equals(const ColumnDescriptor& other) + + shared_ptr[ColumnPath] path() + int16_t max_definition_level() + int16_t max_repetition_level() + + ParquetType physical_type() + const shared_ptr[const CParquetLogicalType]& logical_type() + ParquetConvertedType converted_type() 
+ const c_string& name() + int type_length() + int type_precision() + int type_scale() + + cdef cppclass SchemaDescriptor: + const ColumnDescriptor* Column(int i) + shared_ptr[Node] schema() + GroupNode* group() + c_bool Equals(const SchemaDescriptor& other) + c_string ToString() + int num_columns() + + cdef c_string FormatStatValue(ParquetType parquet_type, c_string val) + + enum ParquetCipher" parquet::ParquetCipher::type": + ParquetCipher_AES_GCM_V1" parquet::ParquetCipher::AES_GCM_V1" + ParquetCipher_AES_GCM_CTR_V1" parquet::ParquetCipher::AES_GCM_CTR_V1" + + struct AadMetadata: + c_string aad_prefix + c_string aad_file_unique + c_bool supply_aad_prefix + + struct EncryptionAlgorithm: + ParquetCipher algorithm + AadMetadata aad + +cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: + cdef cppclass ColumnReader: + pass + + cdef cppclass BoolReader(ColumnReader): + pass + + cdef cppclass Int32Reader(ColumnReader): + pass + + cdef cppclass Int64Reader(ColumnReader): + pass + + cdef cppclass Int96Reader(ColumnReader): + pass + + cdef cppclass FloatReader(ColumnReader): + pass + + cdef cppclass DoubleReader(ColumnReader): + pass + + cdef cppclass ByteArrayReader(ColumnReader): + pass + + cdef cppclass RowGroupReader: + pass + + cdef cppclass CEncodedStatistics" parquet::EncodedStatistics": + const c_string& max() const + const c_string& min() const + int64_t null_count + int64_t distinct_count + bint has_min + bint has_max + bint has_null_count + bint has_distinct_count + + cdef cppclass ParquetByteArray" parquet::ByteArray": + uint32_t len + const uint8_t* ptr + + cdef cppclass ParquetFLBA" parquet::FLBA": + const uint8_t* ptr + + cdef cppclass CStatistics" parquet::Statistics": + int64_t null_count() const + int64_t distinct_count() const + int64_t num_values() const + bint HasMinMax() + bint HasNullCount() + bint HasDistinctCount() + c_bool Equals(const CStatistics&) const + void Reset() + c_string EncodeMin() + c_string EncodeMax() + CEncodedStatistics Encode() + void SetComparator() + ParquetType physical_type() const + const ColumnDescriptor* descr() const + + cdef cppclass CBoolStatistics" parquet::BoolStatistics"(CStatistics): + c_bool min() + c_bool max() + + cdef cppclass CInt32Statistics" parquet::Int32Statistics"(CStatistics): + int32_t min() + int32_t max() + + cdef cppclass CInt64Statistics" parquet::Int64Statistics"(CStatistics): + int64_t min() + int64_t max() + + cdef cppclass CFloatStatistics" parquet::FloatStatistics"(CStatistics): + float min() + float max() + + cdef cppclass CDoubleStatistics" parquet::DoubleStatistics"(CStatistics): + double min() + double max() + + cdef cppclass CByteArrayStatistics \ + " parquet::ByteArrayStatistics"(CStatistics): + ParquetByteArray min() + ParquetByteArray max() + + cdef cppclass CFLBAStatistics" parquet::FLBAStatistics"(CStatistics): + ParquetFLBA min() + ParquetFLBA max() + + cdef cppclass CColumnCryptoMetaData" parquet::ColumnCryptoMetaData": + shared_ptr[ColumnPath] path_in_schema() const + c_bool encrypted_with_footer_key() const + const c_string& key_metadata() const + + cdef cppclass ParquetIndexLocation" parquet::IndexLocation": + int64_t offset + int32_t length + + cdef cppclass CColumnChunkMetaData" parquet::ColumnChunkMetaData": + int64_t file_offset() const + const c_string& file_path() const + + c_bool is_metadata_set() const + ParquetType type() const + int64_t num_values() const + shared_ptr[ColumnPath] path_in_schema() const + bint is_stats_set() const + shared_ptr[CStatistics] statistics() const + 
ParquetCompression compression() const + const vector[ParquetEncoding]& encodings() const + c_bool Equals(const CColumnChunkMetaData&) const + + int64_t has_dictionary_page() const + int64_t dictionary_page_offset() const + int64_t data_page_offset() const + int64_t index_page_offset() const + int64_t total_compressed_size() const + int64_t total_uncompressed_size() const + unique_ptr[CColumnCryptoMetaData] crypto_metadata() const + optional[ParquetIndexLocation] GetColumnIndexLocation() const + optional[ParquetIndexLocation] GetOffsetIndexLocation() const + shared_ptr[const CKeyValueMetadata] key_value_metadata() const + + struct CSortingColumn" parquet::SortingColumn": + int column_idx + c_bool descending + c_bool nulls_first + + cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData": + c_bool Equals(const CRowGroupMetaData&) const + int num_columns() const + int64_t num_rows() const + int64_t total_byte_size() const + vector[CSortingColumn] sorting_columns() const + unique_ptr[CColumnChunkMetaData] ColumnChunk(int i) const + + cdef cppclass CFileMetaData" parquet::FileMetaData": + c_bool Equals(const CFileMetaData&) const + uint32_t size() + int num_columns() + int64_t num_rows() + int num_row_groups() + ParquetVersion version() + const c_string created_by() + int num_schema_elements() + + void set_file_path(const c_string& path) + void AppendRowGroups(const CFileMetaData& other) except + + + unique_ptr[CRowGroupMetaData] RowGroup(int i) + const SchemaDescriptor* schema() + shared_ptr[const CKeyValueMetadata] key_value_metadata() const + void WriteTo(COutputStream* dst) const + + inline c_bool is_encryption_algorithm_set() const + inline EncryptionAlgorithm encryption_algorithm() const + inline const c_string& footer_signing_key_metadata() const + + cdef shared_ptr[CFileMetaData] CFileMetaData_Make \ + " parquet::FileMetaData::Make"(const void* serialized_metadata, + uint32_t* metadata_len) + + cdef cppclass CReaderProperties" parquet::ReaderProperties": + c_bool is_buffered_stream_enabled() const + void enable_buffered_stream() + void disable_buffered_stream() + + void set_buffer_size(int64_t buf_size) + int64_t buffer_size() const + + void set_thrift_string_size_limit(int32_t size) + int32_t thrift_string_size_limit() const + + void set_thrift_container_size_limit(int32_t size) + int32_t thrift_container_size_limit() const + + void file_decryption_properties(shared_ptr[CFileDecryptionProperties] + decryption) + shared_ptr[CFileDecryptionProperties] file_decryption_properties() \ + const + + c_bool page_checksum_verification() const + void set_page_checksum_verification(c_bool check_crc) + + CReaderProperties default_reader_properties() + + cdef cppclass ArrowReaderProperties: + ArrowReaderProperties() + void set_read_dictionary(int column_index, c_bool read_dict) + c_bool read_dictionary() + void set_batch_size(int64_t batch_size) + int64_t batch_size() + void set_pre_buffer(c_bool pre_buffer) + c_bool pre_buffer() const + void set_cache_options(CCacheOptions options) + CCacheOptions cache_options() const + void set_coerce_int96_timestamp_unit(TimeUnit unit) + TimeUnit coerce_int96_timestamp_unit() const + + ArrowReaderProperties default_arrow_reader_properties() + + cdef cppclass ParquetFileReader: + shared_ptr[CFileMetaData] metadata() + + +cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: + cdef cppclass WriterProperties: + cppclass Builder: + Builder* data_page_version(ParquetDataPageVersion version) + Builder* version(ParquetVersion version) + Builder* 
compression(ParquetCompression codec) + Builder* compression(const c_string& path, + ParquetCompression codec) + Builder* compression_level(int compression_level) + Builder* compression_level(const c_string& path, + int compression_level) + Builder* encryption( + shared_ptr[CFileEncryptionProperties] + file_encryption_properties) + Builder* disable_dictionary() + Builder* enable_dictionary() + Builder* enable_dictionary(const c_string& path) + Builder* set_sorting_columns(vector[CSortingColumn] sorting_columns) + Builder* disable_statistics() + Builder* enable_statistics() + Builder* enable_statistics(const c_string& path) + Builder* enable_store_decimal_as_integer() + Builder* disable_store_decimal_as_integer() + Builder* data_pagesize(int64_t size) + Builder* encoding(ParquetEncoding encoding) + Builder* encoding(const c_string& path, + ParquetEncoding encoding) + Builder* max_row_group_length(int64_t size) + Builder* write_batch_size(int64_t batch_size) + Builder* dictionary_pagesize_limit(int64_t dictionary_pagesize_limit) + Builder* enable_write_page_index() + Builder* disable_write_page_index() + Builder* enable_page_checksum() + Builder* disable_page_checksum() + shared_ptr[WriterProperties] build() + + cdef cppclass ArrowWriterProperties: + cppclass Builder: + Builder() + Builder* disable_deprecated_int96_timestamps() + Builder* enable_deprecated_int96_timestamps() + Builder* coerce_timestamps(TimeUnit unit) + Builder* allow_truncated_timestamps() + Builder* disallow_truncated_timestamps() + Builder* store_schema() + Builder* enable_compliant_nested_types() + Builder* disable_compliant_nested_types() + Builder* set_engine_version(ArrowWriterEngineVersion version) + shared_ptr[ArrowWriterProperties] build() + c_bool support_deprecated_int96_timestamps() + + +cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: + cdef cppclass FileReader: + FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader) + + CStatus GetSchema(shared_ptr[CSchema]* out) + + CStatus ReadColumn(int i, shared_ptr[CChunkedArray]* out) + CStatus ReadSchemaField(int i, shared_ptr[CChunkedArray]* out) + + int num_row_groups() + CStatus ReadRowGroup(int i, shared_ptr[CTable]* out) + CStatus ReadRowGroup(int i, const vector[int]& column_indices, + shared_ptr[CTable]* out) + + CStatus ReadRowGroups(const vector[int]& row_groups, + shared_ptr[CTable]* out) + CStatus ReadRowGroups(const vector[int]& row_groups, + const vector[int]& column_indices, + shared_ptr[CTable]* out) + + CStatus GetRecordBatchReader(const vector[int]& row_group_indices, + const vector[int]& column_indices, + unique_ptr[CRecordBatchReader]* out) + CStatus GetRecordBatchReader(const vector[int]& row_group_indices, + unique_ptr[CRecordBatchReader]* out) + + CStatus ReadTable(shared_ptr[CTable]* out) + CStatus ReadTable(const vector[int]& column_indices, + shared_ptr[CTable]* out) + + CStatus ScanContents(vector[int] columns, int32_t column_batch_size, + int64_t* num_rows) + + const ParquetFileReader* parquet_reader() + + void set_use_threads(c_bool use_threads) + + void set_batch_size(int64_t batch_size) + + cdef cppclass FileReaderBuilder: + FileReaderBuilder() + CStatus Open(const shared_ptr[CRandomAccessFile]& file, + const CReaderProperties& properties, + const shared_ptr[CFileMetaData]& metadata) + + ParquetFileReader* raw_reader() + FileReaderBuilder* memory_pool(CMemoryPool*) + FileReaderBuilder* properties(const ArrowReaderProperties&) + CStatus Build(unique_ptr[FileReader]* out) + + CStatus FromParquetSchema( 
+ const SchemaDescriptor* parquet_schema, + const ArrowReaderProperties& properties, + const shared_ptr[const CKeyValueMetadata]& key_value_metadata, + shared_ptr[CSchema]* out) + + CStatus StatisticsAsScalars(const CStatistics& Statistics, + shared_ptr[CScalar]* min, + shared_ptr[CScalar]* max) + +cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: + + CStatus ToParquetSchema( + const CSchema* arrow_schema, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + shared_ptr[SchemaDescriptor]* out) + + +cdef extern from "parquet/properties.h" namespace "parquet" nogil: + cdef enum ArrowWriterEngineVersion: + V1 "parquet::ArrowWriterProperties::V1", + V2 "parquet::ArrowWriterProperties::V2" + + cdef cppclass ParquetDataPageVersion: + pass + + cdef ParquetDataPageVersion ParquetDataPageVersion_V1 \ + " parquet::ParquetDataPageVersion::V1" + cdef ParquetDataPageVersion ParquetDataPageVersion_V2 \ + " parquet::ParquetDataPageVersion::V2" + +cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: + cdef cppclass FileWriter: + + @staticmethod + CResult[unique_ptr[FileWriter]] Open(const CSchema& schema, CMemoryPool* pool, + const shared_ptr[COutputStream]& sink, + const shared_ptr[WriterProperties]& properties, + const shared_ptr[ArrowWriterProperties]& arrow_properties) + + CStatus WriteTable(const CTable& table, int64_t chunk_size) + CStatus NewRowGroup(int64_t chunk_size) + CStatus Close() + CStatus AddKeyValueMetadata(const shared_ptr[const CKeyValueMetadata]& key_value_metadata) + + const shared_ptr[CFileMetaData] metadata() const + + CStatus WriteMetaDataFile( + const CFileMetaData& file_metadata, + const COutputStream* sink) + +cdef class FileEncryptionProperties: + """File-level encryption properties for the low-level API""" + cdef: + shared_ptr[CFileEncryptionProperties] properties + + @staticmethod + cdef inline FileEncryptionProperties wrap( + shared_ptr[CFileEncryptionProperties] properties): + + result = FileEncryptionProperties() + result.properties = properties + return result + + cdef inline shared_ptr[CFileEncryptionProperties] unwrap(self): + return self.properties + +cdef shared_ptr[WriterProperties] _create_writer_properties( + use_dictionary=*, + compression=*, + version=*, + write_statistics=*, + data_page_size=*, + compression_level=*, + use_byte_stream_split=*, + column_encoding=*, + data_page_version=*, + FileEncryptionProperties encryption_properties=*, + write_batch_size=*, + dictionary_pagesize_limit=*, + write_page_index=*, + write_page_checksum=*, + sorting_columns=*, + store_decimal_as_integer=*, +) except * + + +cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( + use_deprecated_int96_timestamps=*, + coerce_timestamps=*, + allow_truncated_timestamps=*, + writer_engine_version=*, + use_compliant_nested_type=*, + store_schema=*, +) except * + +cdef class ParquetSchema(_Weakrefable): + cdef: + FileMetaData parent # the FileMetaData owning the SchemaDescriptor + const SchemaDescriptor* schema + +cdef class FileMetaData(_Weakrefable): + cdef: + shared_ptr[CFileMetaData] sp_metadata + CFileMetaData* _metadata + ParquetSchema _schema + + cdef inline init(self, const shared_ptr[CFileMetaData]& metadata): + self.sp_metadata = metadata + self._metadata = metadata.get() + +cdef class RowGroupMetaData(_Weakrefable): + cdef: + int index # for pickling support + unique_ptr[CRowGroupMetaData] up_metadata + CRowGroupMetaData* metadata + FileMetaData parent + +cdef class 
ColumnChunkMetaData(_Weakrefable): + cdef: + unique_ptr[CColumnChunkMetaData] up_metadata + CColumnChunkMetaData* metadata + RowGroupMetaData parent + + cdef inline init(self, RowGroupMetaData parent, int i): + self.up_metadata = parent.metadata.ColumnChunk(i) + self.metadata = self.up_metadata.get() + self.parent = parent + +cdef class Statistics(_Weakrefable): + cdef: + shared_ptr[CStatistics] statistics + ColumnChunkMetaData parent + + cdef inline init(self, const shared_ptr[CStatistics]& statistics, + ColumnChunkMetaData parent): + self.statistics = statistics + self.parent = parent + +cdef extern from "parquet/encryption/encryption.h" namespace "parquet" nogil: + cdef cppclass CFileDecryptionProperties\ + " parquet::FileDecryptionProperties": + pass + + cdef cppclass CFileEncryptionProperties\ + " parquet::FileEncryptionProperties": + pass + +cdef class FileDecryptionProperties: + """File-level decryption properties for the low-level API""" + cdef: + shared_ptr[CFileDecryptionProperties] properties + + @staticmethod + cdef inline FileDecryptionProperties wrap( + shared_ptr[CFileDecryptionProperties] properties): + + result = FileDecryptionProperties() + result.properties = properties + return result + + cdef inline shared_ptr[CFileDecryptionProperties] unwrap(self): + return self.properties diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.pyx new file mode 100644 index 0000000000000000000000000000000000000000..254bfe3b09a9cd51d5c8d3207d5b59a9b5bdcd3d --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet.pyx @@ -0,0 +1,2266 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: profile=False +# distutils: language = c++ + +from collections.abc import Sequence +from textwrap import indent +import warnings + +from cython.operator cimport dereference as deref +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_python cimport * +from pyarrow.lib cimport (_Weakrefable, Buffer, Schema, + check_status, + MemoryPool, maybe_unbox_memory_pool, + Table, KeyValueMetadata, + pyarrow_wrap_chunked_array, + pyarrow_wrap_schema, + pyarrow_unwrap_metadata, + pyarrow_unwrap_schema, + pyarrow_wrap_table, + pyarrow_wrap_batch, + pyarrow_wrap_scalar, + NativeFile, get_reader, get_writer, + string_to_timeunit) + +from pyarrow.lib import (ArrowException, NativeFile, BufferOutputStream, + _stringify_path, + tobytes, frombytes, is_threading_enabled) + +cimport cpython as cp + +_DEFAULT_ROW_GROUP_SIZE = 1024*1024 +_MAX_ROW_GROUP_SIZE = 64*1024*1024 + +cdef class Statistics(_Weakrefable): + """Statistics for a single column in a single row group.""" + + def __cinit__(self): + pass + + def __repr__(self): + return """{} + has_min_max: {} + min: {} + max: {} + null_count: {} + distinct_count: {} + num_values: {} + physical_type: {} + logical_type: {} + converted_type (legacy): {}""".format(object.__repr__(self), + self.has_min_max, + self.min, + self.max, + self.null_count, + self.distinct_count, + self.num_values, + self.physical_type, + str(self.logical_type), + self.converted_type) + + def to_dict(self): + """ + Get dictionary representation of statistics. + + Returns + ------- + dict + Dictionary with a key for each attribute of this class. + """ + d = dict( + has_min_max=self.has_min_max, + min=self.min, + max=self.max, + null_count=self.null_count, + distinct_count=self.distinct_count, + num_values=self.num_values, + physical_type=self.physical_type + ) + return d + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + + def equals(self, Statistics other): + """ + Return whether the two column statistics objects are equal. + + Parameters + ---------- + other : Statistics + Statistics to compare against. + + Returns + ------- + are_equal : bool + """ + return self.statistics.get().Equals(deref(other.statistics.get())) + + @property + def has_min_max(self): + """Whether min and max are present (bool).""" + return self.statistics.get().HasMinMax() + + @property + def has_null_count(self): + """Whether null count is present (bool).""" + return self.statistics.get().HasNullCount() + + @property + def has_distinct_count(self): + """Whether distinct count is preset (bool).""" + return self.statistics.get().HasDistinctCount() + + @property + def min_raw(self): + """Min value as physical type (bool, int, float, or bytes).""" + if self.has_min_max: + return _cast_statistic_raw_min(self.statistics.get()) + else: + return None + + @property + def max_raw(self): + """Max value as physical type (bool, int, float, or bytes).""" + if self.has_min_max: + return _cast_statistic_raw_max(self.statistics.get()) + else: + return None + + @property + def min(self): + """ + Min value as logical type. + + Returned as the Python equivalent of logical type, such as datetime.date + for dates and decimal.Decimal for decimals. + """ + if self.has_min_max: + min_scalar, _ = _cast_statistics(self.statistics.get()) + return min_scalar.as_py() + else: + return None + + @property + def max(self): + """ + Max value as logical type. 
+ + Returned as the Python equivalent of logical type, such as datetime.date + for dates and decimal.Decimal for decimals. + """ + if self.has_min_max: + _, max_scalar = _cast_statistics(self.statistics.get()) + return max_scalar.as_py() + else: + return None + + @property + def null_count(self): + """Number of null values in chunk (int).""" + if self.has_null_count: + return self.statistics.get().null_count() + else: + return None + + @property + def distinct_count(self): + """Distinct number of values in chunk (int).""" + if self.has_distinct_count: + return self.statistics.get().distinct_count() + else: + return None + + @property + def num_values(self): + """Number of non-null values (int).""" + return self.statistics.get().num_values() + + @property + def physical_type(self): + """Physical type of column (str).""" + raw_physical_type = self.statistics.get().physical_type() + return physical_type_name_from_enum(raw_physical_type) + + @property + def logical_type(self): + """Logical type of column (:class:`ParquetLogicalType`).""" + return wrap_logical_type(self.statistics.get().descr().logical_type()) + + @property + def converted_type(self): + """Legacy converted type (str or None).""" + raw_converted_type = self.statistics.get().descr().converted_type() + return converted_type_name_from_enum(raw_converted_type) + + +cdef class ParquetLogicalType(_Weakrefable): + """Logical type of parquet type.""" + cdef: + shared_ptr[const CParquetLogicalType] type + + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[const CParquetLogicalType]& type): + self.type = type + + def __repr__(self): + return "{}\n {}".format(object.__repr__(self), str(self)) + + def __str__(self): + return frombytes(self.type.get().ToString(), safe=True) + + def to_json(self): + """ + Get a JSON string containing type and type parameters. + + Returns + ------- + json : str + JSON representation of type, with at least a field called 'Type' + which contains the type name. If the type is parameterized, such + as a decimal with scale and precision, will contain those as fields + as well. 
+ """ + return frombytes(self.type.get().ToJSON()) + + @property + def type(self): + """Name of the logical type (str).""" + return logical_type_name_from_enum(self.type.get().type()) + + +cdef wrap_logical_type(const shared_ptr[const CParquetLogicalType]& type): + cdef ParquetLogicalType out = ParquetLogicalType() + out.init(type) + return out + + +cdef _cast_statistic_raw_min(CStatistics* statistics): + cdef ParquetType physical_type = statistics.physical_type() + cdef uint32_t type_length = statistics.descr().type_length() + if physical_type == ParquetType_BOOLEAN: + return ( statistics).min() + elif physical_type == ParquetType_INT32: + return ( statistics).min() + elif physical_type == ParquetType_INT64: + return ( statistics).min() + elif physical_type == ParquetType_FLOAT: + return ( statistics).min() + elif physical_type == ParquetType_DOUBLE: + return ( statistics).min() + elif physical_type == ParquetType_BYTE_ARRAY: + return _box_byte_array(( statistics).min()) + elif physical_type == ParquetType_FIXED_LEN_BYTE_ARRAY: + return _box_flba(( statistics).min(), type_length) + + +cdef _cast_statistic_raw_max(CStatistics* statistics): + cdef ParquetType physical_type = statistics.physical_type() + cdef uint32_t type_length = statistics.descr().type_length() + if physical_type == ParquetType_BOOLEAN: + return ( statistics).max() + elif physical_type == ParquetType_INT32: + return ( statistics).max() + elif physical_type == ParquetType_INT64: + return ( statistics).max() + elif physical_type == ParquetType_FLOAT: + return ( statistics).max() + elif physical_type == ParquetType_DOUBLE: + return ( statistics).max() + elif physical_type == ParquetType_BYTE_ARRAY: + return _box_byte_array(( statistics).max()) + elif physical_type == ParquetType_FIXED_LEN_BYTE_ARRAY: + return _box_flba(( statistics).max(), type_length) + + +cdef _cast_statistics(CStatistics* statistics): + cdef: + shared_ptr[CScalar] c_min + shared_ptr[CScalar] c_max + check_status(StatisticsAsScalars(statistics[0], &c_min, &c_max)) + return (pyarrow_wrap_scalar(c_min), pyarrow_wrap_scalar(c_max)) + + +cdef _box_byte_array(ParquetByteArray val): + return cp.PyBytes_FromStringAndSize( val.ptr, val.len) + + +cdef _box_flba(ParquetFLBA val, uint32_t len): + return cp.PyBytes_FromStringAndSize( val.ptr, len) + + +cdef class ColumnChunkMetaData(_Weakrefable): + """Column metadata for a single row group.""" + + def __cinit__(self): + pass + + def __repr__(self): + statistics = indent(repr(self.statistics), 4 * ' ') + return """{0} + file_offset: {1} + file_path: {2} + physical_type: {3} + num_values: {4} + path_in_schema: {5} + is_stats_set: {6} + statistics: +{7} + compression: {8} + encodings: {9} + has_dictionary_page: {10} + dictionary_page_offset: {11} + data_page_offset: {12} + total_compressed_size: {13} + total_uncompressed_size: {14}""".format(object.__repr__(self), + self.file_offset, + self.file_path, + self.physical_type, + self.num_values, + self.path_in_schema, + self.is_stats_set, + statistics, + self.compression, + self.encodings, + self.has_dictionary_page, + self.dictionary_page_offset, + self.data_page_offset, + self.total_compressed_size, + self.total_uncompressed_size) + + def to_dict(self): + """ + Get dictionary representation of the column chunk metadata. + + Returns + ------- + dict + Dictionary with a key for each attribute of this class. 
+ """ + statistics = self.statistics.to_dict() if self.is_stats_set else None + d = dict( + file_offset=self.file_offset, + file_path=self.file_path, + physical_type=self.physical_type, + num_values=self.num_values, + path_in_schema=self.path_in_schema, + is_stats_set=self.is_stats_set, + statistics=statistics, + compression=self.compression, + encodings=self.encodings, + has_dictionary_page=self.has_dictionary_page, + dictionary_page_offset=self.dictionary_page_offset, + data_page_offset=self.data_page_offset, + total_compressed_size=self.total_compressed_size, + total_uncompressed_size=self.total_uncompressed_size + ) + return d + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + + def equals(self, ColumnChunkMetaData other): + """ + Return whether the two column chunk metadata objects are equal. + + Parameters + ---------- + other : ColumnChunkMetaData + Metadata to compare against. + + Returns + ------- + are_equal : bool + """ + return self.metadata.Equals(deref(other.metadata)) + + @property + def file_offset(self): + """Offset into file where column chunk is located (int).""" + return self.metadata.file_offset() + + @property + def file_path(self): + """Optional file path if set (str or None).""" + return frombytes(self.metadata.file_path()) + + @property + def physical_type(self): + """Physical type of column (str).""" + return physical_type_name_from_enum(self.metadata.type()) + + @property + def num_values(self): + """Total number of values (int).""" + return self.metadata.num_values() + + @property + def path_in_schema(self): + """Nested path to field, separated by periods (str).""" + path = self.metadata.path_in_schema().get().ToDotString() + return frombytes(path) + + @property + def is_stats_set(self): + """Whether or not statistics are present in metadata (bool).""" + return self.metadata.is_stats_set() + + @property + def statistics(self): + """Statistics for column chunk (:class:`Statistics`).""" + if not self.metadata.is_stats_set(): + return None + statistics = Statistics() + statistics.init(self.metadata.statistics(), self) + return statistics + + @property + def compression(self): + """ + Type of compression used for column (str). + + One of 'UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD', + or 'UNKNOWN'. + """ + return compression_name_from_enum(self.metadata.compression()) + + @property + def encodings(self): + """ + Encodings used for column (tuple of str). + + One of 'PLAIN', 'BIT_PACKED', 'RLE', 'BYTE_STREAM_SPLIT', 'DELTA_BINARY_PACKED', + 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'. 
+ """ + return tuple(map(encoding_name_from_enum, self.metadata.encodings())) + + @property + def has_dictionary_page(self): + """Whether there is dictionary data present in the column chunk (bool).""" + return bool(self.metadata.has_dictionary_page()) + + @property + def dictionary_page_offset(self): + """Offset of dictionary page relative to column chunk offset (int).""" + if self.has_dictionary_page: + return self.metadata.dictionary_page_offset() + else: + return None + + @property + def data_page_offset(self): + """Offset of data page relative to column chunk offset (int).""" + return self.metadata.data_page_offset() + + @property + def has_index_page(self): + """Not yet supported.""" + raise NotImplementedError('not supported in parquet-cpp') + + @property + def index_page_offset(self): + """Not yet supported.""" + raise NotImplementedError("parquet-cpp doesn't return valid values") + + @property + def total_compressed_size(self): + """Compressed size in bytes (int).""" + return self.metadata.total_compressed_size() + + @property + def total_uncompressed_size(self): + """Uncompressed size in bytes (int).""" + return self.metadata.total_uncompressed_size() + + @property + def has_offset_index(self): + """Whether the column chunk has an offset index""" + return self.metadata.GetOffsetIndexLocation().has_value() + + @property + def has_column_index(self): + """Whether the column chunk has a column index""" + return self.metadata.GetColumnIndexLocation().has_value() + + @property + def metadata(self): + """Additional metadata as key value pairs (dict[bytes, bytes]).""" + cdef: + unordered_map[c_string, c_string] metadata + const CKeyValueMetadata* underlying_metadata + underlying_metadata = self.metadata.key_value_metadata().get() + if underlying_metadata != NULL: + underlying_metadata.ToUnorderedMap(&metadata) + return metadata + else: + return None + + +cdef class SortingColumn: + """ + Sorting specification for a single column. + + Returned by :meth:`RowGroupMetaData.sorting_columns` and used in + :class:`ParquetWriter` to specify the sort order of the data. + + Parameters + ---------- + column_index : int + Index of column that data is sorted by. + descending : bool, default False + Whether column is sorted in descending order. + nulls_first : bool, default False + Whether null values appear before valid values. + + Notes + ----- + + Column indices are zero-based, refer only to leaf fields, and are in + depth-first order. This may make the column indices for nested schemas + different from what you expect. In most cases, it will be easier to + specify the sort order using column names instead of column indices + and converting using the ``from_ordering`` method. 
+ + Examples + -------- + + In other APIs, sort order is specified by names, such as: + + >>> sort_order = [('id', 'ascending'), ('timestamp', 'descending')] + + For Parquet, the column index must be used instead: + + >>> import pyarrow.parquet as pq + >>> [pq.SortingColumn(0), pq.SortingColumn(1, descending=True)] + [SortingColumn(column_index=0, descending=False, nulls_first=False), SortingColumn(column_index=1, descending=True, nulls_first=False)] + + Convert the sort_order into the list of sorting columns with + ``from_ordering`` (note that the schema must be provided as well): + + >>> import pyarrow as pa + >>> schema = pa.schema([('id', pa.int64()), ('timestamp', pa.timestamp('ms'))]) + >>> sorting_columns = pq.SortingColumn.from_ordering(schema, sort_order) + >>> sorting_columns + (SortingColumn(column_index=0, descending=False, nulls_first=False), SortingColumn(column_index=1, descending=True, nulls_first=False)) + + Convert back to the sort order with ``to_ordering``: + + >>> pq.SortingColumn.to_ordering(schema, sorting_columns) + ((('id', 'ascending'), ('timestamp', 'descending')), 'at_end') + + See Also + -------- + RowGroupMetaData.sorting_columns + """ + cdef int column_index + cdef c_bool descending + cdef c_bool nulls_first + + def __init__(self, int column_index, c_bool descending=False, c_bool nulls_first=False): + self.column_index = column_index + self.descending = descending + self.nulls_first = nulls_first + + @classmethod + def from_ordering(cls, Schema schema, sort_keys, null_placement='at_end'): + """ + Create a tuple of SortingColumn objects from the same arguments as + :class:`pyarrow.compute.SortOptions`. + + Parameters + ---------- + schema : Schema + Schema of the input data. + sort_keys : Sequence of (name, order) tuples + Names of field/column keys (str) to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + null_placement : {'at_start', 'at_end'}, default 'at_end' + Where null values should appear in the sort order. + + Returns + ------- + sorting_columns : tuple of SortingColumn + """ + if null_placement == 'at_start': + nulls_first = True + elif null_placement == 'at_end': + nulls_first = False + else: + raise ValueError('null_placement must be "at_start" or "at_end"') + + col_map = _name_to_index_map(schema) + + sorting_columns = [] + + for sort_key in sort_keys: + if isinstance(sort_key, str): + name = sort_key + descending = False + elif (isinstance(sort_key, tuple) and len(sort_key) == 2 and + isinstance(sort_key[0], str) and + isinstance(sort_key[1], str)): + name, descending = sort_key + if descending == "descending": + descending = True + elif descending == "ascending": + descending = False + else: + raise ValueError("Invalid sort key direction: {0}" + .format(descending)) + else: + raise ValueError("Invalid sort key: {0}".format(sort_key)) + + try: + column_index = col_map[name] + except KeyError: + raise ValueError("Sort key name '{0}' not found in schema:\n{1}" + .format(name, schema)) + + sorting_columns.append( + cls(column_index, descending=descending, nulls_first=nulls_first) + ) + + return tuple(sorting_columns) + + @staticmethod + def to_ordering(Schema schema, sorting_columns): + """ + Convert a tuple of SortingColumn objects to the same format as + :class:`pyarrow.compute.SortOptions`. + + Parameters + ---------- + schema : Schema + Schema of the input data. + sorting_columns : tuple of SortingColumn + Columns to sort the input on. 
+ + Returns + ------- + sort_keys : tuple of (name, order) tuples + null_placement : {'at_start', 'at_end'} + """ + col_map = {i: name for name, i in _name_to_index_map(schema).items()} + + sort_keys = [] + nulls_first = None + + for sorting_column in sorting_columns: + name = col_map[sorting_column.column_index] + if sorting_column.descending: + order = "descending" + else: + order = "ascending" + sort_keys.append((name, order)) + if nulls_first is None: + nulls_first = sorting_column.nulls_first + elif nulls_first != sorting_column.nulls_first: + raise ValueError("Sorting columns have inconsistent null placement") + + if nulls_first: + null_placement = "at_start" + else: + null_placement = "at_end" + + return tuple(sort_keys), null_placement + + def __repr__(self): + return """{}(column_index={}, descending={}, nulls_first={})""".format( + self.__class__.__name__, + self.column_index, self.descending, self.nulls_first) + + def __eq__(self, SortingColumn other): + return (self.column_index == other.column_index and + self.descending == other.descending and + self.nulls_first == other.nulls_first) + + def __hash__(self): + return hash((self.column_index, self.descending, self.nulls_first)) + + @property + def column_index(self): + """"Index of column data is sorted by (int).""" + return self.column_index + + @property + def descending(self): + """Whether column is sorted in descending order (bool).""" + return self.descending + + @property + def nulls_first(self): + """Whether null values appear before valid values (bool).""" + return self.nulls_first + + def to_dict(self): + """ + Get dictionary representation of the SortingColumn. + + Returns + ------- + dict + Dictionary with a key for each attribute of this class. + """ + d = dict( + column_index=self.column_index, + descending=self.descending, + nulls_first=self.nulls_first + ) + return d + + +cdef class RowGroupMetaData(_Weakrefable): + """Metadata for a single row group.""" + + def __cinit__(self, FileMetaData parent, int index): + if index < 0 or index >= parent.num_row_groups: + raise IndexError('{0} out of bounds'.format(index)) + self.up_metadata = parent._metadata.RowGroup(index) + self.metadata = self.up_metadata.get() + self.parent = parent + self.index = index + + def __reduce__(self): + return RowGroupMetaData, (self.parent, self.index) + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + + def equals(self, RowGroupMetaData other): + """ + Return whether the two row group metadata objects are equal. + + Parameters + ---------- + other : RowGroupMetaData + Metadata to compare against. + + Returns + ------- + are_equal : bool + """ + return self.metadata.Equals(deref(other.metadata)) + + def column(self, int i): + """ + Get column metadata at given index. + + Parameters + ---------- + i : int + Index of column to get metadata for. + + Returns + ------- + ColumnChunkMetaData + Metadata for column within this chunk. + """ + if i < 0 or i >= self.num_columns: + raise IndexError('{0} out of bounds'.format(i)) + chunk = ColumnChunkMetaData() + chunk.init(self, i) + return chunk + + def __repr__(self): + return """{0} + num_columns: {1} + num_rows: {2} + total_byte_size: {3} + sorting_columns: {4}""".format(object.__repr__(self), + self.num_columns, + self.num_rows, + self.total_byte_size, + self.sorting_columns) + + def to_dict(self): + """ + Get dictionary representation of the row group metadata. 
+ + Returns + ------- + dict + Dictionary with a key for each attribute of this class. + """ + columns = [] + d = dict( + num_columns=self.num_columns, + num_rows=self.num_rows, + total_byte_size=self.total_byte_size, + columns=columns, + sorting_columns=[col.to_dict() for col in self.sorting_columns] + ) + for i in range(self.num_columns): + columns.append(self.column(i).to_dict()) + return d + + @property + def num_columns(self): + """Number of columns in this row group (int).""" + return self.metadata.num_columns() + + @property + def num_rows(self): + """Number of rows in this row group (int).""" + return self.metadata.num_rows() + + @property + def total_byte_size(self): + """Total byte size of all the uncompressed column data in this row group (int).""" + return self.metadata.total_byte_size() + + @property + def sorting_columns(self): + """Columns the row group is sorted by (tuple of :class:`SortingColumn`)).""" + out = [] + cdef vector[CSortingColumn] sorting_columns = self.metadata.sorting_columns() + for sorting_col in sorting_columns: + out.append(SortingColumn( + sorting_col.column_idx, + sorting_col.descending, + sorting_col.nulls_first + )) + return tuple(out) + + +def _reconstruct_filemetadata(Buffer serialized): + cdef: + FileMetaData metadata = FileMetaData.__new__(FileMetaData) + CBuffer *buffer = serialized.buffer.get() + uint32_t metadata_len = buffer.size() + + metadata.init(CFileMetaData_Make(buffer.data(), &metadata_len)) + + return metadata + + +cdef class FileMetaData(_Weakrefable): + """Parquet metadata for a single file.""" + + def __cinit__(self): + pass + + def __reduce__(self): + cdef: + NativeFile sink = BufferOutputStream() + COutputStream* c_sink = sink.get_output_stream().get() + with nogil: + self._metadata.WriteTo(c_sink) + + cdef Buffer buffer = sink.getvalue() + return _reconstruct_filemetadata, (buffer,) + + def __hash__(self): + return hash((self.schema, + self.num_rows, + self.num_row_groups, + self.format_version, + self.serialized_size)) + + def __repr__(self): + return """{0} + created_by: {1} + num_columns: {2} + num_rows: {3} + num_row_groups: {4} + format_version: {5} + serialized_size: {6}""".format(object.__repr__(self), + self.created_by, self.num_columns, + self.num_rows, self.num_row_groups, + self.format_version, + self.serialized_size) + + def to_dict(self): + """ + Get dictionary representation of the file metadata. + + Returns + ------- + dict + Dictionary with a key for each attribute of this class. + """ + row_groups = [] + d = dict( + created_by=self.created_by, + num_columns=self.num_columns, + num_rows=self.num_rows, + num_row_groups=self.num_row_groups, + row_groups=row_groups, + format_version=self.format_version, + serialized_size=self.serialized_size + ) + for i in range(self.num_row_groups): + row_groups.append(self.row_group(i).to_dict()) + return d + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + + def equals(self, FileMetaData other not None): + """ + Return whether the two file metadata objects are equal. + + Parameters + ---------- + other : FileMetaData + Metadata to compare against. 
+ + Returns + ------- + are_equal : bool + """ + return self._metadata.Equals(deref(other._metadata)) + + @property + def schema(self): + """Schema of the file (:class:`ParquetSchema`).""" + if self._schema is None: + self._schema = ParquetSchema(self) + return self._schema + + @property + def serialized_size(self): + """Size of the original thrift encoded metadata footer (int).""" + return self._metadata.size() + + @property + def num_columns(self): + """Number of columns in file (int).""" + return self._metadata.num_columns() + + @property + def num_rows(self): + """Total number of rows in file (int).""" + return self._metadata.num_rows() + + @property + def num_row_groups(self): + """Number of row groups in file (int).""" + return self._metadata.num_row_groups() + + @property + def format_version(self): + """ + Parquet format version used in file (str, such as '1.0', '2.4'). + + If version is missing or unparsable, will default to assuming '2.6'. + """ + cdef ParquetVersion version = self._metadata.version() + if version == ParquetVersion_V1: + return '1.0' + elif version == ParquetVersion_V2_0: + return 'pseudo-2.0' + elif version == ParquetVersion_V2_4: + return '2.4' + elif version == ParquetVersion_V2_6: + return '2.6' + else: + warnings.warn('Unrecognized file version, assuming 2.6: {}' + .format(version)) + return '2.6' + + @property + def created_by(self): + """ + String describing source of the parquet file (str). + + This typically includes library name and version number. For example, Arrow 7.0's + writer returns 'parquet-cpp-arrow version 7.0.0'. + """ + return frombytes(self._metadata.created_by()) + + @property + def metadata(self): + """Additional metadata as key value pairs (dict[bytes, bytes]).""" + cdef: + unordered_map[c_string, c_string] metadata + const CKeyValueMetadata* underlying_metadata + underlying_metadata = self._metadata.key_value_metadata().get() + if underlying_metadata != NULL: + underlying_metadata.ToUnorderedMap(&metadata) + return metadata + else: + return None + + def row_group(self, int i): + """ + Get metadata for row group at index i. + + Parameters + ---------- + i : int + Row group index to get. + + Returns + ------- + row_group_metadata : RowGroupMetaData + """ + return RowGroupMetaData(self, i) + + def set_file_path(self, path): + """ + Set ColumnChunk file paths to the given value. + + This method modifies the ``file_path`` field of each ColumnChunk + in the FileMetaData to be a particular value. + + Parameters + ---------- + path : str + The file path to set on all ColumnChunks. + """ + cdef: + c_string c_path = tobytes(path) + self._metadata.set_file_path(c_path) + + def append_row_groups(self, FileMetaData other): + """ + Append row groups from other FileMetaData object. + + Parameters + ---------- + other : FileMetaData + Other metadata to append row groups from. + """ + cdef shared_ptr[CFileMetaData] c_metadata + + c_metadata = other.sp_metadata + self._metadata.AppendRowGroups(deref(c_metadata)) + + def write_metadata_file(self, where): + """ + Write the metadata to a metadata-only Parquet file. + + Parameters + ---------- + where : path or file-like object + Where to write the metadata. Should be a writable path on + the local filesystem, or a writable file-like object. 
+ """ + cdef: + shared_ptr[COutputStream] sink + c_string c_where + + try: + where = _stringify_path(where) + except TypeError: + get_writer(where, &sink) + else: + c_where = tobytes(where) + with nogil: + sink = GetResultValue(FileOutputStream.Open(c_where)) + + with nogil: + check_status( + WriteMetaDataFile(deref(self._metadata), sink.get())) + + +cdef class ParquetSchema(_Weakrefable): + """A Parquet schema.""" + + def __cinit__(self, FileMetaData container): + self.parent = container + self.schema = container._metadata.schema() + + def __repr__(self): + return "{0}\n{1}".format( + object.__repr__(self), + frombytes(self.schema.ToString(), safe=True)) + + def __reduce__(self): + return ParquetSchema, (self.parent,) + + def __len__(self): + return self.schema.num_columns() + + def __getitem__(self, i): + return self.column(i) + + def __hash__(self): + return hash(self.schema.ToString()) + + @property + def names(self): + """Name of each field (list of str).""" + return [self[i].name for i in range(len(self))] + + def to_arrow_schema(self): + """ + Convert Parquet schema to effective Arrow schema. + + Returns + ------- + schema : Schema + """ + cdef shared_ptr[CSchema] sp_arrow_schema + + with nogil: + check_status(FromParquetSchema( + self.schema, default_arrow_reader_properties(), + self.parent._metadata.key_value_metadata(), + &sp_arrow_schema)) + + return pyarrow_wrap_schema(sp_arrow_schema) + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + + def equals(self, ParquetSchema other): + """ + Return whether the two schemas are equal. + + Parameters + ---------- + other : ParquetSchema + Schema to compare against. + + Returns + ------- + are_equal : bool + """ + return self.schema.Equals(deref(other.schema)) + + def column(self, i): + """ + Return the schema for a single column. + + Parameters + ---------- + i : int + Index of column in schema. + + Returns + ------- + column_schema : ColumnSchema + """ + if i < 0 or i >= len(self): + raise IndexError('{0} out of bounds'.format(i)) + + return ColumnSchema(self, i) + + +cdef class ColumnSchema(_Weakrefable): + """Schema for a single column.""" + cdef: + int index + ParquetSchema parent + const ColumnDescriptor* descr + + def __cinit__(self, ParquetSchema schema, int index): + self.parent = schema + self.index = index # for pickling support + self.descr = schema.schema.Column(index) + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + + def __reduce__(self): + return ColumnSchema, (self.parent, self.index) + + def equals(self, ColumnSchema other): + """ + Return whether the two column schemas are equal. + + Parameters + ---------- + other : ColumnSchema + Schema to compare against. 
+ + Returns + ------- + are_equal : bool + """ + return self.descr.Equals(deref(other.descr)) + + def __repr__(self): + physical_type = self.physical_type + converted_type = self.converted_type + if converted_type == 'DECIMAL': + converted_type = 'DECIMAL({0}, {1})'.format(self.precision, + self.scale) + elif physical_type == 'FIXED_LEN_BYTE_ARRAY': + converted_type = ('FIXED_LEN_BYTE_ARRAY(length={0})' + .format(self.length)) + + return """ + name: {0} + path: {1} + max_definition_level: {2} + max_repetition_level: {3} + physical_type: {4} + logical_type: {5} + converted_type (legacy): {6}""".format(self.name, self.path, + self.max_definition_level, + self.max_repetition_level, + physical_type, + str(self.logical_type), + converted_type) + + @property + def name(self): + """Name of field (str).""" + return frombytes(self.descr.name()) + + @property + def path(self): + """Nested path to field, separated by periods (str).""" + return frombytes(self.descr.path().get().ToDotString()) + + @property + def max_definition_level(self): + """Maximum definition level (int).""" + return self.descr.max_definition_level() + + @property + def max_repetition_level(self): + """Maximum repetition level (int).""" + return self.descr.max_repetition_level() + + @property + def physical_type(self): + """Name of physical type (str).""" + return physical_type_name_from_enum(self.descr.physical_type()) + + @property + def logical_type(self): + """Logical type of column (:class:`ParquetLogicalType`).""" + return wrap_logical_type(self.descr.logical_type()) + + @property + def converted_type(self): + """Legacy converted type (str or None).""" + return converted_type_name_from_enum(self.descr.converted_type()) + + # FIXED_LEN_BYTE_ARRAY attribute + @property + def length(self): + """Array length if fixed length byte array type, None otherwise (int or None).""" + return self.descr.type_length() + + # Decimal attributes + @property + def precision(self): + """Precision if decimal type, None otherwise (int or None).""" + return self.descr.type_precision() + + @property + def scale(self): + """Scale if decimal type, None otherwise (int or None).""" + return self.descr.type_scale() + + +cdef physical_type_name_from_enum(ParquetType type_): + return { + ParquetType_BOOLEAN: 'BOOLEAN', + ParquetType_INT32: 'INT32', + ParquetType_INT64: 'INT64', + ParquetType_INT96: 'INT96', + ParquetType_FLOAT: 'FLOAT', + ParquetType_DOUBLE: 'DOUBLE', + ParquetType_BYTE_ARRAY: 'BYTE_ARRAY', + ParquetType_FIXED_LEN_BYTE_ARRAY: 'FIXED_LEN_BYTE_ARRAY', + }.get(type_, 'UNKNOWN') + + +cdef logical_type_name_from_enum(ParquetLogicalTypeId type_): + return { + ParquetLogicalType_UNDEFINED: 'UNDEFINED', + ParquetLogicalType_STRING: 'STRING', + ParquetLogicalType_MAP: 'MAP', + ParquetLogicalType_LIST: 'LIST', + ParquetLogicalType_ENUM: 'ENUM', + ParquetLogicalType_DECIMAL: 'DECIMAL', + ParquetLogicalType_DATE: 'DATE', + ParquetLogicalType_TIME: 'TIME', + ParquetLogicalType_TIMESTAMP: 'TIMESTAMP', + ParquetLogicalType_INT: 'INT', + ParquetLogicalType_FLOAT16: 'FLOAT16', + ParquetLogicalType_JSON: 'JSON', + ParquetLogicalType_BSON: 'BSON', + ParquetLogicalType_UUID: 'UUID', + ParquetLogicalType_NONE: 'NONE', + }.get(type_, 'UNKNOWN') + + +cdef converted_type_name_from_enum(ParquetConvertedType type_): + return { + ParquetConvertedType_NONE: 'NONE', + ParquetConvertedType_UTF8: 'UTF8', + ParquetConvertedType_MAP: 'MAP', + ParquetConvertedType_MAP_KEY_VALUE: 'MAP_KEY_VALUE', + ParquetConvertedType_LIST: 'LIST', + ParquetConvertedType_ENUM: 'ENUM', + 
ParquetConvertedType_DECIMAL: 'DECIMAL', + ParquetConvertedType_DATE: 'DATE', + ParquetConvertedType_TIME_MILLIS: 'TIME_MILLIS', + ParquetConvertedType_TIME_MICROS: 'TIME_MICROS', + ParquetConvertedType_TIMESTAMP_MILLIS: 'TIMESTAMP_MILLIS', + ParquetConvertedType_TIMESTAMP_MICROS: 'TIMESTAMP_MICROS', + ParquetConvertedType_UINT_8: 'UINT_8', + ParquetConvertedType_UINT_16: 'UINT_16', + ParquetConvertedType_UINT_32: 'UINT_32', + ParquetConvertedType_UINT_64: 'UINT_64', + ParquetConvertedType_INT_8: 'INT_8', + ParquetConvertedType_INT_16: 'INT_16', + ParquetConvertedType_INT_32: 'INT_32', + ParquetConvertedType_INT_64: 'INT_64', + ParquetConvertedType_JSON: 'JSON', + ParquetConvertedType_BSON: 'BSON', + ParquetConvertedType_INTERVAL: 'INTERVAL', + }.get(type_, 'UNKNOWN') + + +cdef encoding_name_from_enum(ParquetEncoding encoding_): + return { + ParquetEncoding_PLAIN: 'PLAIN', + ParquetEncoding_PLAIN_DICTIONARY: 'PLAIN_DICTIONARY', + ParquetEncoding_RLE: 'RLE', + ParquetEncoding_BIT_PACKED: 'BIT_PACKED', + ParquetEncoding_DELTA_BINARY_PACKED: 'DELTA_BINARY_PACKED', + ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: 'DELTA_LENGTH_BYTE_ARRAY', + ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY', + ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY', + ParquetEncoding_BYTE_STREAM_SPLIT: 'BYTE_STREAM_SPLIT', + }.get(encoding_, 'UNKNOWN') + + +cdef encoding_enum_from_name(str encoding_name): + enc = { + 'PLAIN': ParquetEncoding_PLAIN, + 'BIT_PACKED': ParquetEncoding_BIT_PACKED, + 'RLE': ParquetEncoding_RLE, + 'BYTE_STREAM_SPLIT': ParquetEncoding_BYTE_STREAM_SPLIT, + 'DELTA_BINARY_PACKED': ParquetEncoding_DELTA_BINARY_PACKED, + 'DELTA_LENGTH_BYTE_ARRAY': ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY, + 'DELTA_BYTE_ARRAY': ParquetEncoding_DELTA_BYTE_ARRAY, + 'RLE_DICTIONARY': 'dict', + 'PLAIN_DICTIONARY': 'dict', + }.get(encoding_name, None) + if enc is None: + raise ValueError(f"Unsupported column encoding: {encoding_name!r}") + elif enc == 'dict': + raise ValueError(f"{encoding_name!r} is already used by default.") + else: + return enc + + +cdef compression_name_from_enum(ParquetCompression compression_): + return { + ParquetCompression_UNCOMPRESSED: 'UNCOMPRESSED', + ParquetCompression_SNAPPY: 'SNAPPY', + ParquetCompression_GZIP: 'GZIP', + ParquetCompression_LZO: 'LZO', + ParquetCompression_BROTLI: 'BROTLI', + ParquetCompression_LZ4: 'LZ4', + ParquetCompression_ZSTD: 'ZSTD', + }.get(compression_, 'UNKNOWN') + + +cdef int check_compression_name(name) except -1: + if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', + 'ZSTD'}: + raise ArrowException("Unsupported compression: " + name) + return 0 + + +cdef ParquetCompression compression_from_name(name): + name = name.upper() + if name == 'SNAPPY': + return ParquetCompression_SNAPPY + elif name == 'GZIP': + return ParquetCompression_GZIP + elif name == 'LZO': + return ParquetCompression_LZO + elif name == 'BROTLI': + return ParquetCompression_BROTLI + elif name == 'LZ4': + return ParquetCompression_LZ4 + elif name == 'ZSTD': + return ParquetCompression_ZSTD + else: + return ParquetCompression_UNCOMPRESSED + + +cdef class ParquetReader(_Weakrefable): + cdef: + object source + CMemoryPool* pool + UniquePtrNoGIL[FileReader] reader + FileMetaData _metadata + shared_ptr[CRandomAccessFile] rd_handle + + cdef public: + _column_idx_map + + def __cinit__(self, MemoryPool memory_pool=None): + self.pool = maybe_unbox_memory_pool(memory_pool) + self._metadata = None + + def open(self, object source not None, *, bint use_memory_map=False, + 
read_dictionary=None, FileMetaData metadata=None, + int buffer_size=0, bint pre_buffer=False, + coerce_int96_timestamp_unit=None, + FileDecryptionProperties decryption_properties=None, + thrift_string_size_limit=None, + thrift_container_size_limit=None, + page_checksum_verification=False): + """ + Open a parquet file for reading. + + Parameters + ---------- + source : str, pathlib.Path, pyarrow.NativeFile, or file-like object + use_memory_map : bool, default False + read_dictionary : iterable[int or str], optional + metadata : FileMetaData, optional + buffer_size : int, default 0 + pre_buffer : bool, default False + coerce_int96_timestamp_unit : str, optional + decryption_properties : FileDecryptionProperties, optional + thrift_string_size_limit : int, optional + thrift_container_size_limit : int, optional + page_checksum_verification : bool, default False + """ + cdef: + shared_ptr[CFileMetaData] c_metadata + CReaderProperties properties = default_reader_properties() + ArrowReaderProperties arrow_props = ( + default_arrow_reader_properties()) + FileReaderBuilder builder + + if pre_buffer and not is_threading_enabled(): + pre_buffer = False + + if metadata is not None: + c_metadata = metadata.sp_metadata + + if buffer_size > 0: + properties.enable_buffered_stream() + properties.set_buffer_size(buffer_size) + elif buffer_size == 0: + properties.disable_buffered_stream() + else: + raise ValueError('Buffer size must be larger than zero') + + if thrift_string_size_limit is not None: + if thrift_string_size_limit <= 0: + raise ValueError("thrift_string_size_limit " + "must be larger than zero") + properties.set_thrift_string_size_limit(thrift_string_size_limit) + if thrift_container_size_limit is not None: + if thrift_container_size_limit <= 0: + raise ValueError("thrift_container_size_limit " + "must be larger than zero") + properties.set_thrift_container_size_limit( + thrift_container_size_limit) + + if decryption_properties is not None: + properties.file_decryption_properties( + decryption_properties.unwrap()) + + arrow_props.set_pre_buffer(pre_buffer) + + properties.set_page_checksum_verification(page_checksum_verification) + + if coerce_int96_timestamp_unit is None: + # use the default defined in default_arrow_reader_properties() + pass + else: + arrow_props.set_coerce_int96_timestamp_unit( + string_to_timeunit(coerce_int96_timestamp_unit)) + + self.source = source + get_reader(source, use_memory_map, &self.rd_handle) + + with nogil: + check_status(builder.Open(self.rd_handle, properties, c_metadata)) + + # Set up metadata + with nogil: + c_metadata = builder.raw_reader().metadata() + self._metadata = result = FileMetaData() + result.init(c_metadata) + + if read_dictionary is not None: + self._set_read_dictionary(read_dictionary, &arrow_props) + + with nogil: + check_status(builder.memory_pool(self.pool) + .properties(arrow_props) + .Build(&self.reader)) + + cdef _set_read_dictionary(self, read_dictionary, + ArrowReaderProperties* props): + for column in read_dictionary: + if not isinstance(column, int): + column = self.column_name_idx(column) + props.set_read_dictionary(column, True) + + @property + def column_paths(self): + cdef: + FileMetaData container = self.metadata + const CFileMetaData* metadata = container._metadata + vector[c_string] path + int i = 0 + + paths = [] + for i in range(0, metadata.num_columns()): + path = (metadata.schema().Column(i) + .path().get().ToDotVector()) + paths.append([frombytes(x) for x in path]) + + return paths + + @property + def metadata(self): + 
return self._metadata + + @property + def schema_arrow(self): + cdef shared_ptr[CSchema] out + with nogil: + check_status(self.reader.get().GetSchema(&out)) + return pyarrow_wrap_schema(out) + + @property + def num_row_groups(self): + return self.reader.get().num_row_groups() + + def set_use_threads(self, bint use_threads): + """ + Parameters + ---------- + use_threads : bool + """ + if is_threading_enabled(): + self.reader.get().set_use_threads(use_threads) + else: + self.reader.get().set_use_threads(False) + + def set_batch_size(self, int64_t batch_size): + """ + Parameters + ---------- + batch_size : int64 + """ + self.reader.get().set_batch_size(batch_size) + + def iter_batches(self, int64_t batch_size, row_groups, column_indices=None, + bint use_threads=True): + """ + Parameters + ---------- + batch_size : int64 + row_groups : list[int] + column_indices : list[int], optional + use_threads : bool, default True + + Yields + ------ + next : RecordBatch + """ + cdef: + vector[int] c_row_groups + vector[int] c_column_indices + shared_ptr[CRecordBatch] record_batch + UniquePtrNoGIL[CRecordBatchReader] recordbatchreader + + self.set_batch_size(batch_size) + + if use_threads: + self.set_use_threads(use_threads) + + for row_group in row_groups: + c_row_groups.push_back(row_group) + + if column_indices is not None: + for index in column_indices: + c_column_indices.push_back(index) + with nogil: + check_status( + self.reader.get().GetRecordBatchReader( + c_row_groups, c_column_indices, &recordbatchreader + ) + ) + else: + with nogil: + check_status( + self.reader.get().GetRecordBatchReader( + c_row_groups, &recordbatchreader + ) + ) + + while True: + with nogil: + check_status( + recordbatchreader.get().ReadNext(&record_batch) + ) + if record_batch.get() == NULL: + break + + yield pyarrow_wrap_batch(record_batch) + + def read_row_group(self, int i, column_indices=None, + bint use_threads=True): + """ + Parameters + ---------- + i : int + column_indices : list[int], optional + use_threads : bool, default True + + Returns + ------- + table : pyarrow.Table + """ + return self.read_row_groups([i], column_indices, use_threads) + + def read_row_groups(self, row_groups not None, column_indices=None, + bint use_threads=True): + """ + Parameters + ---------- + row_groups : list[int] + column_indices : list[int], optional + use_threads : bool, default True + + Returns + ------- + table : pyarrow.Table + """ + cdef: + shared_ptr[CTable] ctable + vector[int] c_row_groups + vector[int] c_column_indices + + self.set_use_threads(use_threads) + + for row_group in row_groups: + c_row_groups.push_back(row_group) + + if column_indices is not None: + for index in column_indices: + c_column_indices.push_back(index) + + with nogil: + check_status(self.reader.get() + .ReadRowGroups(c_row_groups, c_column_indices, + &ctable)) + else: + # Read all columns + with nogil: + check_status(self.reader.get() + .ReadRowGroups(c_row_groups, &ctable)) + return pyarrow_wrap_table(ctable) + + def read_all(self, column_indices=None, bint use_threads=True): + """ + Parameters + ---------- + column_indices : list[int], optional + use_threads : bool, default True + + Returns + ------- + table : pyarrow.Table + """ + cdef: + shared_ptr[CTable] ctable + vector[int] c_column_indices + + self.set_use_threads(use_threads) + + if column_indices is not None: + for index in column_indices: + c_column_indices.push_back(index) + + with nogil: + check_status(self.reader.get() + .ReadTable(c_column_indices, &ctable)) + else: + # Read all columns + 
with nogil: + check_status(self.reader.get() + .ReadTable(&ctable)) + return pyarrow_wrap_table(ctable) + + def scan_contents(self, column_indices=None, batch_size=65536): + """ + Parameters + ---------- + column_indices : list[int], optional + batch_size : int32, default 65536 + + Returns + ------- + num_rows : int64 + """ + cdef: + vector[int] c_column_indices + int32_t c_batch_size + int64_t c_num_rows + + if column_indices is not None: + for index in column_indices: + c_column_indices.push_back(index) + + c_batch_size = batch_size + + with nogil: + check_status(self.reader.get() + .ScanContents(c_column_indices, c_batch_size, + &c_num_rows)) + + return c_num_rows + + def column_name_idx(self, column_name): + """ + Find the index of a column by its name. + + Parameters + ---------- + column_name : str + Name of the column; separation of nesting levels is done via ".". + + Returns + ------- + column_idx : int + Integer index of the column in the schema. + """ + cdef: + FileMetaData container = self.metadata + const CFileMetaData* metadata = container._metadata + int i = 0 + + if self._column_idx_map is None: + self._column_idx_map = {} + for i in range(0, metadata.num_columns()): + col_bytes = tobytes(metadata.schema().Column(i) + .path().get().ToDotString()) + self._column_idx_map[col_bytes] = i + + return self._column_idx_map[tobytes(column_name)] + + def read_column(self, int column_index): + """ + Read the column at the specified index. + + Parameters + ---------- + column_index : int + Index of the column. + + Returns + ------- + column : pyarrow.ChunkedArray + """ + cdef shared_ptr[CChunkedArray] out + with nogil: + check_status(self.reader.get() + .ReadColumn(column_index, &out)) + return pyarrow_wrap_chunked_array(out) + + def close(self): + if not self.closed: + with nogil: + check_status(self.rd_handle.get().Close()) + + @property + def closed(self): + if self.rd_handle == NULL: + return True + with nogil: + closed = self.rd_handle.get().closed() + return closed + + +cdef CSortingColumn _convert_sorting_column(SortingColumn sorting_column): + cdef CSortingColumn c_sorting_column + + c_sorting_column.column_idx = sorting_column.column_index + c_sorting_column.descending = sorting_column.descending + c_sorting_column.nulls_first = sorting_column.nulls_first + + return c_sorting_column + + +cdef vector[CSortingColumn] _convert_sorting_columns(sorting_columns) except *: + if not (isinstance(sorting_columns, Sequence) + and all(isinstance(col, SortingColumn) for col in sorting_columns)): + raise ValueError( + "'sorting_columns' must be a list of `SortingColumn`") + + cdef vector[CSortingColumn] c_sorting_columns = [_convert_sorting_column(col) + for col in sorting_columns] + + return c_sorting_columns + + +cdef shared_ptr[WriterProperties] _create_writer_properties( + use_dictionary=None, + compression=None, + version=None, + write_statistics=None, + data_page_size=None, + compression_level=None, + use_byte_stream_split=False, + column_encoding=None, + data_page_version=None, + FileEncryptionProperties encryption_properties=None, + write_batch_size=None, + dictionary_pagesize_limit=None, + write_page_index=False, + write_page_checksum=False, + sorting_columns=None, + store_decimal_as_integer=False) except *: + + """General writer properties""" + cdef: + shared_ptr[WriterProperties] properties + WriterProperties.Builder props + + # data_page_version + + if data_page_version is not None: + if data_page_version == "1.0": + props.data_page_version(ParquetDataPageVersion_V1) + elif 
data_page_version == "2.0": + props.data_page_version(ParquetDataPageVersion_V2) + else: + raise ValueError("Unsupported Parquet data page version: {0}" + .format(data_page_version)) + + # version + + if version is not None: + if version == "1.0": + props.version(ParquetVersion_V1) + elif version in ("2.0", "pseudo-2.0"): + warnings.warn( + "Parquet format '2.0' pseudo version is deprecated, use " + "'2.4' or '2.6' for fine-grained feature selection", + FutureWarning, stacklevel=2) + props.version(ParquetVersion_V2_0) + elif version == "2.4": + props.version(ParquetVersion_V2_4) + elif version == "2.6": + props.version(ParquetVersion_V2_6) + else: + raise ValueError("Unsupported Parquet format version: {0}" + .format(version)) + + # compression + + if isinstance(compression, basestring): + check_compression_name(compression) + props.compression(compression_from_name(compression)) + elif compression is not None: + for column, codec in compression.iteritems(): + check_compression_name(codec) + props.compression(tobytes(column), compression_from_name(codec)) + + if isinstance(compression_level, int): + props.compression_level(compression_level) + elif compression_level is not None: + for column, level in compression_level.iteritems(): + props.compression_level(tobytes(column), level) + + # use_dictionary + + if isinstance(use_dictionary, bool): + if use_dictionary: + props.enable_dictionary() + if column_encoding is not None: + raise ValueError( + "To use 'column_encoding' set 'use_dictionary' to False") + else: + props.disable_dictionary() + elif use_dictionary is not None: + # Deactivate dictionary encoding by default + props.disable_dictionary() + for column in use_dictionary: + props.enable_dictionary(tobytes(column)) + if (column_encoding is not None and + column_encoding.get(column) is not None): + raise ValueError( + "To use 'column_encoding' set 'use_dictionary' to False") + + # write_statistics + + if isinstance(write_statistics, bool): + if write_statistics: + props.enable_statistics() + else: + props.disable_statistics() + elif write_statistics is not None: + # Deactivate statistics by default and enable for specified columns + props.disable_statistics() + for column in write_statistics: + props.enable_statistics(tobytes(column)) + + # sorting_columns + + if sorting_columns is not None: + props.set_sorting_columns(_convert_sorting_columns(sorting_columns)) + + # use_byte_stream_split + + if isinstance(use_byte_stream_split, bool): + if use_byte_stream_split: + if column_encoding is not None: + raise ValueError( + "'use_byte_stream_split' cannot be passed" + "together with 'column_encoding'") + else: + props.encoding(ParquetEncoding_BYTE_STREAM_SPLIT) + elif use_byte_stream_split is not None: + for column in use_byte_stream_split: + if column_encoding is None: + column_encoding = {column: 'BYTE_STREAM_SPLIT'} + elif column_encoding.get(column, None) is None: + column_encoding[column] = 'BYTE_STREAM_SPLIT' + else: + raise ValueError( + "'use_byte_stream_split' cannot be passed" + "together with 'column_encoding'") + + # store_decimal_as_integer + + if isinstance(store_decimal_as_integer, bool): + if store_decimal_as_integer: + props.enable_store_decimal_as_integer() + else: + props.disable_store_decimal_as_integer() + else: + raise TypeError("'store_decimal_as_integer' must be a boolean") + + # column_encoding + # encoding map - encode individual columns + + if column_encoding is not None: + if isinstance(column_encoding, dict): + for column, _encoding in column_encoding.items(): + 
props.encoding(tobytes(column), + encoding_enum_from_name(_encoding)) + elif isinstance(column_encoding, str): + props.encoding(encoding_enum_from_name(column_encoding)) + else: + raise TypeError( + "'column_encoding' should be a dictionary or a string") + + if data_page_size is not None: + props.data_pagesize(data_page_size) + + if write_batch_size is not None: + props.write_batch_size(write_batch_size) + + if dictionary_pagesize_limit is not None: + props.dictionary_pagesize_limit(dictionary_pagesize_limit) + + # encryption + + if encryption_properties is not None: + props.encryption( + (encryption_properties).unwrap()) + + # For backwards compatibility reasons we cap the maximum row group size + # at 64Mi rows. This could be changed in the future, though it would be + # a breaking change. + # + # The user can always specify a smaller row group size (and the default + # is smaller) when calling write_table. If the call to write_table uses + # a size larger than this then it will be latched to this value. + props.max_row_group_length(_MAX_ROW_GROUP_SIZE) + + # checksum + + if write_page_checksum: + props.enable_page_checksum() + else: + props.disable_page_checksum() + + # page index + + if write_page_index: + props.enable_write_page_index() + else: + props.disable_write_page_index() + + properties = props.build() + + return properties + + +cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( + use_deprecated_int96_timestamps=False, + coerce_timestamps=None, + allow_truncated_timestamps=False, + writer_engine_version=None, + use_compliant_nested_type=True, + store_schema=True) except *: + """Arrow writer properties""" + cdef: + shared_ptr[ArrowWriterProperties] arrow_properties + ArrowWriterProperties.Builder arrow_props + + # Store the original Arrow schema so things like dictionary types can + # be automatically reconstructed + if store_schema: + arrow_props.store_schema() + + # int96 support + + if use_deprecated_int96_timestamps: + arrow_props.enable_deprecated_int96_timestamps() + else: + arrow_props.disable_deprecated_int96_timestamps() + + # coerce_timestamps + + if coerce_timestamps == 'ms': + arrow_props.coerce_timestamps(TimeUnit_MILLI) + elif coerce_timestamps == 'us': + arrow_props.coerce_timestamps(TimeUnit_MICRO) + elif coerce_timestamps is not None: + raise ValueError('Invalid value for coerce_timestamps: {0}' + .format(coerce_timestamps)) + + # allow_truncated_timestamps + + if allow_truncated_timestamps: + arrow_props.allow_truncated_timestamps() + else: + arrow_props.disallow_truncated_timestamps() + + # use_compliant_nested_type + + if use_compliant_nested_type: + arrow_props.enable_compliant_nested_types() + else: + arrow_props.disable_compliant_nested_types() + + # writer_engine_version + + if writer_engine_version == "V1": + warnings.warn("V1 parquet writer engine is a no-op. 
Use V2.") + arrow_props.set_engine_version(ArrowWriterEngineVersion.V1) + elif writer_engine_version != "V2": + raise ValueError("Unsupported Writer Engine Version: {0}" + .format(writer_engine_version)) + + arrow_properties = arrow_props.build() + + return arrow_properties + +cdef _name_to_index_map(Schema arrow_schema): + cdef: + shared_ptr[CSchema] sp_arrow_schema + shared_ptr[SchemaDescriptor] sp_parquet_schema + shared_ptr[WriterProperties] props = _create_writer_properties() + shared_ptr[ArrowWriterProperties] arrow_props = _create_arrow_writer_properties( + use_deprecated_int96_timestamps=False, + coerce_timestamps=None, + allow_truncated_timestamps=False, + writer_engine_version="V2" + ) + + sp_arrow_schema = pyarrow_unwrap_schema(arrow_schema) + + with nogil: + check_status(ToParquetSchema( + sp_arrow_schema.get(), deref(props.get()), deref(arrow_props.get()), &sp_parquet_schema)) + + out = dict() + + cdef SchemaDescriptor* parquet_schema = sp_parquet_schema.get() + + for i in range(parquet_schema.num_columns()): + name = frombytes(parquet_schema.Column(i).path().get().ToDotString()) + out[name] = i + + return out + + +cdef class ParquetWriter(_Weakrefable): + cdef: + unique_ptr[FileWriter] writer + shared_ptr[COutputStream] sink + bint own_sink + + cdef readonly: + object use_dictionary + object use_deprecated_int96_timestamps + object use_byte_stream_split + object column_encoding + object coerce_timestamps + object allow_truncated_timestamps + object compression + object compression_level + object data_page_version + object use_compliant_nested_type + object version + object write_statistics + object writer_engine_version + int row_group_size + int64_t data_page_size + FileEncryptionProperties encryption_properties + int64_t write_batch_size + int64_t dictionary_pagesize_limit + object store_schema + object store_decimal_as_integer + + def __cinit__(self, where, Schema schema not None, use_dictionary=None, + compression=None, version=None, + write_statistics=None, + MemoryPool memory_pool=None, + use_deprecated_int96_timestamps=False, + coerce_timestamps=None, + data_page_size=None, + allow_truncated_timestamps=False, + compression_level=None, + use_byte_stream_split=False, + column_encoding=None, + writer_engine_version=None, + data_page_version=None, + use_compliant_nested_type=True, + encryption_properties=None, + write_batch_size=None, + dictionary_pagesize_limit=None, + store_schema=True, + write_page_index=False, + write_page_checksum=False, + sorting_columns=None, + store_decimal_as_integer=False): + cdef: + shared_ptr[WriterProperties] properties + shared_ptr[ArrowWriterProperties] arrow_properties + c_string c_where + CMemoryPool* pool + + try: + where = _stringify_path(where) + except TypeError: + get_writer(where, &self.sink) + self.own_sink = False + else: + c_where = tobytes(where) + with nogil: + self.sink = GetResultValue(FileOutputStream.Open(c_where)) + self.own_sink = True + + properties = _create_writer_properties( + use_dictionary=use_dictionary, + compression=compression, + version=version, + write_statistics=write_statistics, + data_page_size=data_page_size, + compression_level=compression_level, + use_byte_stream_split=use_byte_stream_split, + column_encoding=column_encoding, + data_page_version=data_page_version, + encryption_properties=encryption_properties, + write_batch_size=write_batch_size, + dictionary_pagesize_limit=dictionary_pagesize_limit, + write_page_index=write_page_index, + write_page_checksum=write_page_checksum, + 
sorting_columns=sorting_columns, + store_decimal_as_integer=store_decimal_as_integer, + ) + arrow_properties = _create_arrow_writer_properties( + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, + coerce_timestamps=coerce_timestamps, + allow_truncated_timestamps=allow_truncated_timestamps, + writer_engine_version=writer_engine_version, + use_compliant_nested_type=use_compliant_nested_type, + store_schema=store_schema, + ) + + pool = maybe_unbox_memory_pool(memory_pool) + with nogil: + self.writer = move(GetResultValue( + FileWriter.Open(deref(schema.schema), pool, + self.sink, properties, arrow_properties))) + + def close(self): + with nogil: + check_status(self.writer.get().Close()) + if self.own_sink: + check_status(self.sink.get().Close()) + + def write_table(self, Table table, row_group_size=None): + cdef: + CTable* ctable = table.table + int64_t c_row_group_size + + if row_group_size is None or row_group_size == -1: + c_row_group_size = min(ctable.num_rows(), _DEFAULT_ROW_GROUP_SIZE) + elif row_group_size == 0: + raise ValueError('Row group size cannot be 0') + else: + c_row_group_size = row_group_size + + with nogil: + check_status(self.writer.get() + .WriteTable(deref(ctable), c_row_group_size)) + + def add_key_value_metadata(self, key_value_metadata): + cdef: + shared_ptr[const CKeyValueMetadata] c_metadata + + c_metadata = pyarrow_unwrap_metadata(KeyValueMetadata(key_value_metadata)) + with nogil: + check_status(self.writer.get() + .AddKeyValueMetadata(c_metadata)) + + @property + def metadata(self): + cdef: + shared_ptr[CFileMetaData] metadata + FileMetaData result + with nogil: + metadata = self.writer.get().metadata() + if metadata: + result = FileMetaData() + result.init(metadata) + return result + raise RuntimeError( + 'file metadata is only available after writer close') diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..4cdabfbc53acba4f41f16fff6f7b34054cb69f1a Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.pxd new file mode 100644 index 0000000000000000000000000000000000000000..d52669501a4044838e576d3dac8f8a422874eaa6 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.pxd @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
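For illustration only, a minimal sketch (not part of the vendored diff) of how the options handled by _create_writer_properties() and ParquetWriter above surface through the public pyarrow.parquet API; the output path and column names are placeholders:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"id": [1, 2, 3], "payload": ["a", "b", "c"]})

pq.write_table(
    table,
    "/tmp/example.parquet",        # placeholder path
    version="2.6",                 # maps to props.version(ParquetVersion_V2_6)
    compression="zstd",            # maps to props.compression(...)
    compression_level=6,           # maps to props.compression_level(...)
    use_dictionary=["id"],         # dictionary encoding only for the listed columns
    write_statistics=["id"],       # statistics only for the listed columns
    write_page_checksum=True,      # maps to props.enable_page_checksum()
)

Note that, per the comment in _create_writer_properties() above, any requested row group length larger than _MAX_ROW_GROUP_SIZE is clamped by max_row_group_length().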
+ +# distutils: language = c++ +# cython: language_level = 3 + +from pyarrow.includes.common cimport * +from pyarrow.includes.libparquet_encryption cimport * +from pyarrow._parquet cimport (ParquetCipher, + CFileEncryptionProperties, + CFileDecryptionProperties, + FileEncryptionProperties, + FileDecryptionProperties, + ParquetCipher_AES_GCM_V1, + ParquetCipher_AES_GCM_CTR_V1) +from pyarrow.lib cimport _Weakrefable + +cdef class CryptoFactory(_Weakrefable): + cdef shared_ptr[CPyCryptoFactory] factory + cdef init(self, callable_client_factory) + cdef inline shared_ptr[CPyCryptoFactory] unwrap(self) + +cdef class EncryptionConfiguration(_Weakrefable): + cdef shared_ptr[CEncryptionConfiguration] configuration + cdef inline shared_ptr[CEncryptionConfiguration] unwrap(self) nogil + +cdef class DecryptionConfiguration(_Weakrefable): + cdef shared_ptr[CDecryptionConfiguration] configuration + cdef inline shared_ptr[CDecryptionConfiguration] unwrap(self) nogil + +cdef class KmsConnectionConfig(_Weakrefable): + cdef shared_ptr[CKmsConnectionConfig] configuration + cdef inline shared_ptr[CKmsConnectionConfig] unwrap(self) nogil + + @staticmethod + cdef wrap(const CKmsConnectionConfig& config) + + +cdef shared_ptr[CCryptoFactory] pyarrow_unwrap_cryptofactory(object crypto_factory) except * +cdef shared_ptr[CKmsConnectionConfig] pyarrow_unwrap_kmsconnectionconfig(object kmsconnectionconfig) except * +cdef shared_ptr[CEncryptionConfiguration] pyarrow_unwrap_encryptionconfig(object encryptionconfig) except * +cdef shared_ptr[CDecryptionConfiguration] pyarrow_unwrap_decryptionconfig(object decryptionconfig) except * diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.pyx new file mode 100644 index 0000000000000000000000000000000000000000..d0a9a6612328c547bc724d6fcf2d37ae5e7badd3 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_parquet_encryption.pyx @@ -0,0 +1,484 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
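As a usage illustration for the encryption classes declared above and implemented below, here is a hedged sketch wiring a toy in-memory KmsClient into a CryptoFactory via the public pyarrow.parquet.encryption module; the key identifiers and column name are placeholders, and the base64 "wrapping" is for demonstration only, not security:

import base64
import pyarrow.parquet.encryption as pe

class InMemoryKmsClient(pe.KmsClient):
    # Toy KMS: "wraps" data keys with base64 only -- demonstration, not security.
    def __init__(self, kms_connection_config):
        pe.KmsClient.__init__(self)

    def wrap_key(self, key_bytes, master_key_identifier):
        return base64.b64encode(key_bytes)

    def unwrap_key(self, wrapped_key, master_key_identifier):
        return base64.b64decode(wrapped_key)

crypto_factory = pe.CryptoFactory(lambda config: InMemoryKmsClient(config))
kms_config = pe.KmsConnectionConfig()                  # all-default connection settings
encryption_config = pe.EncryptionConfiguration(
    footer_key="footer_key_id",                        # placeholder master key IDs
    column_keys={"column_key_id": ["secret_column"]},
)
file_encryption_properties = crypto_factory.file_encryption_properties(
    kms_config, encryption_config)
# The resulting properties can then be passed to a writer, e.g.
# pq.ParquetWriter(path, schema, encryption_properties=file_encryption_properties)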
+ +# cython: profile=False +# distutils: language = c++ + +from datetime import timedelta + +from cython.operator cimport dereference as deref +from libcpp.memory cimport shared_ptr +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.lib cimport _Weakrefable +from pyarrow.lib import tobytes, frombytes + + +cdef ParquetCipher cipher_from_name(name): + name = name.upper() + if name == 'AES_GCM_V1': + return ParquetCipher_AES_GCM_V1 + elif name == 'AES_GCM_CTR_V1': + return ParquetCipher_AES_GCM_CTR_V1 + else: + raise ValueError(f'Invalid cipher name: {name!r}') + + +cdef cipher_to_name(ParquetCipher cipher): + if ParquetCipher_AES_GCM_V1 == cipher: + return 'AES_GCM_V1' + elif ParquetCipher_AES_GCM_CTR_V1 == cipher: + return 'AES_GCM_CTR_V1' + else: + raise ValueError('Invalid cipher value: {0}'.format(cipher)) + +cdef class EncryptionConfiguration(_Weakrefable): + """Configuration of the encryption, such as which columns to encrypt""" + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, footer_key, *, column_keys=None, + encryption_algorithm=None, + plaintext_footer=None, double_wrapping=None, + cache_lifetime=None, internal_key_material=None, + data_key_length_bits=None): + self.configuration.reset( + new CEncryptionConfiguration(tobytes(footer_key))) + if column_keys is not None: + self.column_keys = column_keys + if encryption_algorithm is not None: + self.encryption_algorithm = encryption_algorithm + if plaintext_footer is not None: + self.plaintext_footer = plaintext_footer + if double_wrapping is not None: + self.double_wrapping = double_wrapping + if cache_lifetime is not None: + self.cache_lifetime = cache_lifetime + if internal_key_material is not None: + self.internal_key_material = internal_key_material + if data_key_length_bits is not None: + self.data_key_length_bits = data_key_length_bits + + @property + def footer_key(self): + """ID of the master key for footer encryption/signing""" + return frombytes(self.configuration.get().footer_key) + + @property + def column_keys(self): + """ + List of columns to encrypt, with master key IDs. + """ + column_keys_str = frombytes(self.configuration.get().column_keys) + # Convert from "masterKeyID:colName,colName;masterKeyID:colName..." + # (see HIVE-21848) to dictionary of master key ID to column name lists + column_keys_to_key_list_str = dict(subString.replace(" ", "").split( + ":") for subString in column_keys_str.split(";")) + column_keys_dict = {k: v.split( + ",") for k, v in column_keys_to_key_list_str.items()} + return column_keys_dict + + @column_keys.setter + def column_keys(self, dict value): + if value is not None: + # convert a dictionary such as + # '{"key1": ["col1 ", "col2"], "key2": ["col3 ", "col4"]}'' + # to the string defined by the spec + # 'key1: col1 , col2; key2: col3 , col4' + column_keys = "; ".join( + ["{}: {}".format(k, ", ".join(v)) for k, v in value.items()]) + self.configuration.get().column_keys = tobytes(column_keys) + + @property + def encryption_algorithm(self): + """Parquet encryption algorithm. 
+ Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1".""" + return cipher_to_name(self.configuration.get().encryption_algorithm) + + @encryption_algorithm.setter + def encryption_algorithm(self, value): + cipher = cipher_from_name(value) + self.configuration.get().encryption_algorithm = cipher + + @property + def plaintext_footer(self): + """Write files with plaintext footer.""" + return self.configuration.get().plaintext_footer + + @plaintext_footer.setter + def plaintext_footer(self, value): + self.configuration.get().plaintext_footer = value + + @property + def double_wrapping(self): + """Use double wrapping - where data encryption keys (DEKs) are + encrypted with key encryption keys (KEKs), which in turn are + encrypted with master keys. + If set to false, use single wrapping - where DEKs are + encrypted directly with master keys.""" + return self.configuration.get().double_wrapping + + @double_wrapping.setter + def double_wrapping(self, value): + self.configuration.get().double_wrapping = value + + @property + def cache_lifetime(self): + """Lifetime of cached entities (key encryption keys, + local wrapping keys, KMS client objects).""" + return timedelta( + seconds=self.configuration.get().cache_lifetime_seconds) + + @cache_lifetime.setter + def cache_lifetime(self, value): + if not isinstance(value, timedelta): + raise TypeError("cache_lifetime should be a timedelta") + self.configuration.get().cache_lifetime_seconds = value.total_seconds() + + @property + def internal_key_material(self): + """Store key material inside Parquet file footers; this mode doesn’t + produce additional files. If set to false, key material is stored in + separate files in the same folder, which enables key rotation for + immutable Parquet files.""" + return self.configuration.get().internal_key_material + + @internal_key_material.setter + def internal_key_material(self, value): + self.configuration.get().internal_key_material = value + + @property + def data_key_length_bits(self): + """Length of data encryption keys (DEKs), randomly generated by parquet key + management tools. 
Can be 128, 192 or 256 bits.""" + return self.configuration.get().data_key_length_bits + + @data_key_length_bits.setter + def data_key_length_bits(self, value): + self.configuration.get().data_key_length_bits = value + + cdef inline shared_ptr[CEncryptionConfiguration] unwrap(self) nogil: + return self.configuration + + +cdef class DecryptionConfiguration(_Weakrefable): + """Configuration of the decryption, such as cache timeout.""" + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, *, cache_lifetime=None): + self.configuration.reset(new CDecryptionConfiguration()) + + @property + def cache_lifetime(self): + """Lifetime of cached entities (key encryption keys, + local wrapping keys, KMS client objects).""" + return timedelta( + seconds=self.configuration.get().cache_lifetime_seconds) + + @cache_lifetime.setter + def cache_lifetime(self, value): + self.configuration.get().cache_lifetime_seconds = value.total_seconds() + + cdef inline shared_ptr[CDecryptionConfiguration] unwrap(self) nogil: + return self.configuration + + +cdef class KmsConnectionConfig(_Weakrefable): + """Configuration of the connection to the Key Management Service (KMS)""" + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, *, kms_instance_id=None, kms_instance_url=None, + key_access_token=None, custom_kms_conf=None): + self.configuration.reset(new CKmsConnectionConfig()) + if kms_instance_id is not None: + self.kms_instance_id = kms_instance_id + if kms_instance_url is not None: + self.kms_instance_url = kms_instance_url + if key_access_token is None: + self.key_access_token = b'DEFAULT' + else: + self.key_access_token = key_access_token + if custom_kms_conf is not None: + self.custom_kms_conf = custom_kms_conf + + @property + def kms_instance_id(self): + """ID of the KMS instance that will be used for encryption + (if multiple KMS instances are available).""" + return frombytes(self.configuration.get().kms_instance_id) + + @kms_instance_id.setter + def kms_instance_id(self, value): + self.configuration.get().kms_instance_id = tobytes(value) + + @property + def kms_instance_url(self): + """URL of the KMS instance.""" + return frombytes(self.configuration.get().kms_instance_url) + + @kms_instance_url.setter + def kms_instance_url(self, value): + self.configuration.get().kms_instance_url = tobytes(value) + + @property + def key_access_token(self): + """Authorization token that will be passed to KMS.""" + return frombytes(self.configuration.get() + .refreshable_key_access_token.get().value()) + + @key_access_token.setter + def key_access_token(self, value): + self.refresh_key_access_token(value) + + @property + def custom_kms_conf(self): + """A dictionary with KMS-type-specific configuration""" + custom_kms_conf = { + frombytes(k): frombytes(v) + for k, v in self.configuration.get().custom_kms_conf + } + return custom_kms_conf + + @custom_kms_conf.setter + def custom_kms_conf(self, dict value): + if value is not None: + for k, v in value.items(): + if isinstance(k, str) and isinstance(v, str): + self.configuration.get().custom_kms_conf[tobytes(k)] = \ + tobytes(v) + else: + raise TypeError("Expected custom_kms_conf to be " + + "a dictionary of strings") + + def refresh_key_access_token(self, value): + cdef: + shared_ptr[CKeyAccessToken] c_key_access_token = \ + self.configuration.get().refreshable_key_access_token + + c_key_access_token.get().Refresh(tobytes(value)) + + cdef inline shared_ptr[CKmsConnectionConfig] unwrap(self) nogil: + return self.configuration + 
+ @staticmethod + cdef wrap(const CKmsConnectionConfig& config): + result = KmsConnectionConfig() + result.configuration = make_shared[CKmsConnectionConfig](move(config)) + return result + + +# Callback definitions for CPyKmsClientVtable +cdef void _cb_wrap_key( + handler, const c_string& key_bytes, + const c_string& master_key_identifier, c_string* out) except *: + mkid_str = frombytes(master_key_identifier) + wrapped_key = handler.wrap_key(key_bytes, mkid_str) + out[0] = tobytes(wrapped_key) + + +cdef void _cb_unwrap_key( + handler, const c_string& wrapped_key, + const c_string& master_key_identifier, c_string* out) except *: + mkid_str = frombytes(master_key_identifier) + wk_str = frombytes(wrapped_key) + key = handler.unwrap_key(wk_str, mkid_str) + out[0] = tobytes(key) + + +cdef class KmsClient(_Weakrefable): + """The abstract base class for KmsClient implementations.""" + cdef: + shared_ptr[CKmsClient] client + + def __init__(self): + self.init() + + cdef init(self): + cdef: + CPyKmsClientVtable vtable = CPyKmsClientVtable() + + vtable.wrap_key = _cb_wrap_key + vtable.unwrap_key = _cb_unwrap_key + + self.client.reset(new CPyKmsClient(self, vtable)) + + def wrap_key(self, key_bytes, master_key_identifier): + """Wrap a key - encrypt it with the master key.""" + raise NotImplementedError() + + def unwrap_key(self, wrapped_key, master_key_identifier): + """Unwrap a key - decrypt it with the master key.""" + raise NotImplementedError() + + cdef inline shared_ptr[CKmsClient] unwrap(self) nogil: + return self.client + + +# Callback definition for CPyKmsClientFactoryVtable +cdef void _cb_create_kms_client( + handler, + const CKmsConnectionConfig& kms_connection_config, + shared_ptr[CKmsClient]* out) except *: + connection_config = KmsConnectionConfig.wrap(kms_connection_config) + + result = handler(connection_config) + if not isinstance(result, KmsClient): + raise TypeError( + "callable must return KmsClient instances, but got {}".format( + type(result))) + + out[0] = ( result).unwrap() + + +cdef class CryptoFactory(_Weakrefable): + """ A factory that produces the low-level FileEncryptionProperties and + FileDecryptionProperties objects, from the high-level parameters.""" + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, kms_client_factory): + """Create CryptoFactory. + + Parameters + ---------- + kms_client_factory : a callable that accepts KmsConnectionConfig + and returns a KmsClient + """ + self.factory.reset(new CPyCryptoFactory()) + + if callable(kms_client_factory): + self.init(kms_client_factory) + else: + raise TypeError("Parameter kms_client_factory must be a callable") + + cdef init(self, callable_client_factory): + cdef: + CPyKmsClientFactoryVtable vtable + shared_ptr[CPyKmsClientFactory] kms_client_factory + + vtable.create_kms_client = _cb_create_kms_client + kms_client_factory.reset( + new CPyKmsClientFactory(callable_client_factory, vtable)) + # A KmsClientFactory object must be registered + # via this method before calling any of + # file_encryption_properties()/file_decryption_properties() methods. + self.factory.get().RegisterKmsClientFactory( + static_pointer_cast[CKmsClientFactory, CPyKmsClientFactory]( + kms_client_factory)) + + def file_encryption_properties(self, + KmsConnectionConfig kms_connection_config, + EncryptionConfiguration encryption_config): + """Create file encryption properties. 
+ + Parameters + ---------- + kms_connection_config : KmsConnectionConfig + Configuration of connection to KMS + + encryption_config : EncryptionConfiguration + Configuration of the encryption, such as which columns to encrypt + + Returns + ------- + file_encryption_properties : FileEncryptionProperties + File encryption properties. + """ + cdef: + CResult[shared_ptr[CFileEncryptionProperties]] \ + file_encryption_properties_result + with nogil: + file_encryption_properties_result = \ + self.factory.get().SafeGetFileEncryptionProperties( + deref(kms_connection_config.unwrap().get()), + deref(encryption_config.unwrap().get())) + file_encryption_properties = GetResultValue( + file_encryption_properties_result) + return FileEncryptionProperties.wrap(file_encryption_properties) + + def file_decryption_properties( + self, + KmsConnectionConfig kms_connection_config, + DecryptionConfiguration decryption_config=None): + """Create file decryption properties. + + Parameters + ---------- + kms_connection_config : KmsConnectionConfig + Configuration of connection to KMS + + decryption_config : DecryptionConfiguration, default None + Configuration of the decryption, such as cache timeout. + Can be None. + + Returns + ------- + file_decryption_properties : FileDecryptionProperties + File decryption properties. + """ + cdef: + CDecryptionConfiguration c_decryption_config + CResult[shared_ptr[CFileDecryptionProperties]] \ + c_file_decryption_properties + if decryption_config is None: + c_decryption_config = CDecryptionConfiguration() + else: + c_decryption_config = deref(decryption_config.unwrap().get()) + with nogil: + c_file_decryption_properties = \ + self.factory.get().SafeGetFileDecryptionProperties( + deref(kms_connection_config.unwrap().get()), + c_decryption_config) + file_decryption_properties = GetResultValue( + c_file_decryption_properties) + return FileDecryptionProperties.wrap(file_decryption_properties) + + def remove_cache_entries_for_token(self, access_token): + self.factory.get().RemoveCacheEntriesForToken(tobytes(access_token)) + + def remove_cache_entries_for_all_tokens(self): + self.factory.get().RemoveCacheEntriesForAllTokens() + + cdef inline shared_ptr[CPyCryptoFactory] unwrap(self): + return self.factory + + +cdef shared_ptr[CCryptoFactory] pyarrow_unwrap_cryptofactory(object crypto_factory) except *: + if isinstance(crypto_factory, CryptoFactory): + pycf = ( crypto_factory).unwrap() + return static_pointer_cast[CCryptoFactory, CPyCryptoFactory](pycf) + raise TypeError("Expected CryptoFactory, got %s" % type(crypto_factory)) + + +cdef shared_ptr[CKmsConnectionConfig] pyarrow_unwrap_kmsconnectionconfig(object kmsconnectionconfig) except *: + if isinstance(kmsconnectionconfig, KmsConnectionConfig): + return ( kmsconnectionconfig).unwrap() + raise TypeError("Expected KmsConnectionConfig, got %s" % type(kmsconnectionconfig)) + + +cdef shared_ptr[CEncryptionConfiguration] pyarrow_unwrap_encryptionconfig(object encryptionconfig) except *: + if isinstance(encryptionconfig, EncryptionConfiguration): + return ( encryptionconfig).unwrap() + raise TypeError("Expected EncryptionConfiguration, got %s" % type(encryptionconfig)) + + +cdef shared_ptr[CDecryptionConfiguration] pyarrow_unwrap_decryptionconfig(object decryptionconfig) except *: + if isinstance(decryptionconfig, DecryptionConfiguration): + return ( decryptionconfig).unwrap() + raise TypeError("Expected DecryptionConfiguration, got %s" % type(decryptionconfig)) diff --git 
a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..7a83bce25ee9cf1282aa523fed0007d33dde78ee Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.pxd b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.pxd new file mode 100644 index 0000000000000000000000000000000000000000..91c0220d7310870a7803ecceb2c32b8b32f8c11d --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.pxd @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ +# cython: language_level = 3 + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport CStatus + + +ctypedef CStatus cb_test_func() + +cdef extern from "arrow/python/python_test.h" namespace "arrow::py::testing" nogil: + + cdef cppclass CTestCase "arrow::py::testing::TestCase": + c_string name + cb_test_func func + + vector[CTestCase] GetCppTestCases() diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.pyx new file mode 100644 index 0000000000000000000000000000000000000000..adb148351306c02667346b3750c08f2efd8a6625 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_pyarrow_cpp_tests.pyx @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False, binding=True +# distutils: language = c++ + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.lib cimport check_status + +from pyarrow.lib import frombytes + + +cdef class CppTestCase: + """ + A simple wrapper for a C++ test case. 
+ """ + cdef: + CTestCase c_case + + @staticmethod + cdef wrap(CTestCase c_case): + cdef: + CppTestCase obj + obj = CppTestCase.__new__(CppTestCase) + obj.c_case = c_case + return obj + + @property + def name(self): + return frombytes(self.c_case.name) + + def __repr__(self): + return f"<{self.__class__.__name__} {self.name!r}>" + + def __call__(self): + check_status(self.c_case.func()) + + +def get_cpp_tests(): + """ + Get a list of C++ test cases. + """ + cases = [] + c_cases = GetCppTestCases() + for c_case in c_cases: + cases.append(CppTestCase.wrap(c_case)) + return cases diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_s3fs.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_s3fs.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..7daa81e1aedebc6c3b49696a69e0adfde3396c1a Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_s3fs.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_s3fs.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_s3fs.pyx new file mode 100644 index 0000000000000000000000000000000000000000..ba6603322838dd14400ecc0cc71ac20340a4a83a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_s3fs.pyx @@ -0,0 +1,479 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
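A possible way to consume get_cpp_tests() defined above from a Python test suite; the pytest wiring is an assumption for illustration and not part of the vendored sources:

import pytest
from pyarrow._pyarrow_cpp_tests import get_cpp_tests

@pytest.mark.parametrize("test_case", get_cpp_tests(), ids=lambda case: case.name)
def test_cpp_internals(test_case):
    # Calling the wrapper runs the embedded C++ test case;
    # check_status() raises if the underlying CStatus is an error.
    test_case()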
+ +# cython: language_level = 3 + +from cython cimport binding + +from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata, + pyarrow_unwrap_metadata) +from pyarrow.lib import frombytes, tobytes, KeyValueMetadata +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_fs cimport * +from pyarrow._fs cimport FileSystem + + +cpdef enum S3LogLevel: + Off = CS3LogLevel_Off + Fatal = CS3LogLevel_Fatal + Error = CS3LogLevel_Error + Warn = CS3LogLevel_Warn + Info = CS3LogLevel_Info + Debug = CS3LogLevel_Debug + Trace = CS3LogLevel_Trace + + +def initialize_s3(S3LogLevel log_level=S3LogLevel.Fatal, int num_event_loop_threads=1): + """ + Initialize S3 support + + Parameters + ---------- + log_level : S3LogLevel + level of logging + num_event_loop_threads : int, default 1 + how many threads to use for the AWS SDK's I/O event loop + + Examples + -------- + >>> fs.initialize_s3(fs.S3LogLevel.Error) # doctest: +SKIP + """ + cdef CS3GlobalOptions options + options.log_level = log_level + options.num_event_loop_threads = num_event_loop_threads + check_status(CInitializeS3(options)) + + +def ensure_s3_initialized(): + """ + Initialize S3 (with default options) if not already initialized + """ + check_status(CEnsureS3Initialized()) + + +def finalize_s3(): + check_status(CFinalizeS3()) + + +def ensure_s3_finalized(): + """ + Finalize S3 if already initialized + """ + check_status(CEnsureS3Finalized()) + + +def resolve_s3_region(bucket): + """ + Resolve the S3 region of a bucket. + + Parameters + ---------- + bucket : str + A S3 bucket name + + Returns + ------- + region : str + A S3 region name + + Examples + -------- + >>> fs.resolve_s3_region('voltrondata-labs-datasets') + 'us-east-2' + """ + cdef: + c_string c_bucket + c_string c_region + + ensure_s3_initialized() + + c_bucket = tobytes(bucket) + with nogil: + c_region = GetResultValue(ResolveS3BucketRegion(c_bucket)) + + return frombytes(c_region) + + +class S3RetryStrategy: + """ + Base class for AWS retry strategies for use with S3. + + Parameters + ---------- + max_attempts : int, default 3 + The maximum number of retry attempts to attempt before failing. + """ + + def __init__(self, max_attempts=3): + self.max_attempts = max_attempts + + +class AwsStandardS3RetryStrategy(S3RetryStrategy): + """ + Represents an AWS Standard retry strategy for use with S3. + + Parameters + ---------- + max_attempts : int, default 3 + The maximum number of retry attempts to attempt before failing. + """ + pass + + +class AwsDefaultS3RetryStrategy(S3RetryStrategy): + """ + Represents an AWS Default retry strategy for use with S3. + + Parameters + ---------- + max_attempts : int, default 3 + The maximum number of retry attempts to attempt before failing. + """ + pass + + +cdef class S3FileSystem(FileSystem): + """ + S3-backed FileSystem implementation + + AWS access_key and secret_key can be provided explicitly. + + If role_arn is provided instead of access_key and secret_key, temporary + credentials will be fetched by issuing a request to STS to assume the + specified role. + + If neither access_key nor secret_key are provided, and role_arn is also not + provided, then attempts to establish the credentials automatically. 
+ S3FileSystem will try the following methods, in order: + + * ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``, and ``AWS_SESSION_TOKEN`` environment variables + * configuration files such as ``~/.aws/credentials`` and ``~/.aws/config`` + * for nodes on Amazon EC2, the EC2 Instance Metadata Service + + Note: S3 buckets are special and the operations available on them may be + limited or more expensive than desired. + + When S3FileSystem creates new buckets (assuming allow_bucket_creation is + True), it does not pass any non-default settings. In AWS S3, the bucket and + all objects will be not publicly visible, and will have no bucket policies + and no resource tags. To have more control over how buckets are created, + use a different API to create them. + + Parameters + ---------- + access_key : str, default None + AWS Access Key ID. Pass None to use the standard AWS environment + variables and/or configuration file. + secret_key : str, default None + AWS Secret Access key. Pass None to use the standard AWS environment + variables and/or configuration file. + session_token : str, default None + AWS Session Token. An optional session token, required if access_key + and secret_key are temporary credentials from STS. + anonymous : bool, default False + Whether to connect anonymously if access_key and secret_key are None. + If true, will not attempt to look up credentials using standard AWS + configuration methods. + role_arn : str, default None + AWS Role ARN. If provided instead of access_key and secret_key, + temporary credentials will be fetched by assuming this role. + session_name : str, default None + An optional identifier for the assumed role session. + external_id : str, default None + An optional unique identifier that might be required when you assume + a role in another account. + load_frequency : int, default 900 + The frequency (in seconds) with which temporary credentials from an + assumed role session will be refreshed. + region : str, default None + AWS region to connect to. If not set, the AWS SDK will attempt to + determine the region using heuristics such as environment variables, + configuration profile, EC2 metadata, or default to 'us-east-1' when SDK + version <1.8. One can also use :func:`pyarrow.fs.resolve_s3_region` to + automatically resolve the region from a bucket name. + request_timeout : double, default None + Socket read timeouts on Windows and macOS, in seconds. + If omitted, the AWS SDK default value is used (typically 3 seconds). + This option is ignored on non-Windows, non-macOS systems. + connect_timeout : double, default None + Socket connection timeout, in seconds. + If omitted, the AWS SDK default value is used (typically 1 second). + scheme : str, default 'https' + S3 connection transport scheme. + endpoint_override : str, default None + Override region with a connect string such as "localhost:9000" + background_writes : bool, default True + Whether file writes will be issued in the background, without + blocking. + default_metadata : mapping or pyarrow.KeyValueMetadata, default None + Default metadata for open_output_stream. This will be ignored if + non-empty metadata is passed to open_output_stream. + proxy_options : dict or str, default None + If a proxy is used, provide the options here. Supported options are: + 'scheme' (str: 'http' or 'https'; required), 'host' (str; required), + 'port' (int; required), 'username' (str; optional), + 'password' (str; optional). 
+ A proxy URI (str) can also be provided, in which case these options + will be derived from the provided URI. + The following are equivalent:: + + S3FileSystem(proxy_options='http://username:password@localhost:8020') + S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost', + 'port': 8020, 'username': 'username', + 'password': 'password'}) + allow_bucket_creation : bool, default False + Whether to allow directory creation at the bucket-level. This option may also be + passed in a URI query parameter. + allow_bucket_deletion : bool, default False + Whether to allow directory deletion at the bucket-level. This option may also be + passed in a URI query parameter. + check_directory_existence_before_creation : bool, default false + Whether to check the directory existence before creating it. + If false, when creating a directory the code will not check if it already + exists or not. It's an optimization to try directory creation and catch the error, + rather than issue two dependent I/O calls. + If true, when creating a directory the code will only create the directory when necessary + at the cost of extra I/O calls. This can be used for key/value cloud storage which has + a hard rate limit to number of object mutation operations or scenerios such as + the directories already exist and you do not have creation access. + retry_strategy : S3RetryStrategy, default AwsStandardS3RetryStrategy(max_attempts=3) + The retry strategy to use with S3; fail after max_attempts. Available + strategies are AwsStandardS3RetryStrategy, AwsDefaultS3RetryStrategy. + force_virtual_addressing : bool, default False + Whether to use virtual addressing of buckets. + If true, then virtual addressing is always enabled. + If false, then virtual addressing is only enabled if `endpoint_override` is empty. + This can be used for non-AWS backends that only support virtual hosted-style access. + + Examples + -------- + >>> from pyarrow import fs + >>> s3 = fs.S3FileSystem(region='us-west-2') + >>> s3.get_file_info(fs.FileSelector( + ... 'power-analysis-ready-datastore/power_901_constants.zarr/FROCEAN', recursive=True + ... )) + [ wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.s3fs = wrapped.get() + + @staticmethod + @binding(True) # Required for cython < 3 + def _reconstruct(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + return S3FileSystem(**kwargs) + + def __reduce__(self): + cdef CS3Options opts = self.s3fs.options() + + # if creds were explicitly provided, then use them + # else obtain them as they were last time. 
+ if opts.credentials_kind == CS3CredentialsKind_Explicit: + access_key = frombytes(opts.GetAccessKey()) + secret_key = frombytes(opts.GetSecretKey()) + session_token = frombytes(opts.GetSessionToken()) + else: + access_key = None + secret_key = None + session_token = None + + return ( + S3FileSystem._reconstruct, (dict( + access_key=access_key, + secret_key=secret_key, + session_token=session_token, + anonymous=(opts.credentials_kind == + CS3CredentialsKind_Anonymous), + region=frombytes(opts.region), + scheme=frombytes(opts.scheme), + connect_timeout=opts.connect_timeout, + request_timeout=opts.request_timeout, + endpoint_override=frombytes(opts.endpoint_override), + role_arn=frombytes(opts.role_arn), + session_name=frombytes(opts.session_name), + external_id=frombytes(opts.external_id), + load_frequency=opts.load_frequency, + background_writes=opts.background_writes, + allow_bucket_creation=opts.allow_bucket_creation, + allow_bucket_deletion=opts.allow_bucket_deletion, + check_directory_existence_before_creation=opts.check_directory_existence_before_creation, + default_metadata=pyarrow_wrap_metadata(opts.default_metadata), + proxy_options={'scheme': frombytes(opts.proxy_options.scheme), + 'host': frombytes(opts.proxy_options.host), + 'port': opts.proxy_options.port, + 'username': frombytes( + opts.proxy_options.username), + 'password': frombytes( + opts.proxy_options.password)}, + force_virtual_addressing=opts.force_virtual_addressing, + ),) + ) + + @property + def region(self): + """ + The AWS region this filesystem connects to. + """ + return frombytes(self.s3fs.region()) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_substrait.cpython-312-x86_64-linux-gnu.so b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_substrait.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..aac373d23a998fa2b37df50ec4687dff670a9e47 Binary files /dev/null and b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_substrait.cpython-312-x86_64-linux-gnu.so differ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_substrait.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_substrait.pyx new file mode 100644 index 0000000000000000000000000000000000000000..067cb5f91681bacf430945bc5aec2bb04e0cb01b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/_substrait.pyx @@ -0,0 +1,349 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
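To round out the S3FileSystem docstring above, a small hedged example combining resolve_s3_region() with a few of the documented constructor options; the bucket name and prefix are placeholders and the calls require network access:

from pyarrow import fs

# resolve_s3_region() performs a network lookup; "example-bucket" is a placeholder.
region = fs.resolve_s3_region("example-bucket")

s3 = fs.S3FileSystem(
    region=region,
    connect_timeout=5,            # seconds
    request_timeout=10,           # seconds; only honored on Windows/macOS per the docstring
    allow_bucket_creation=False,
)
listing = s3.get_file_info(fs.FileSelector("example-bucket/some/prefix", recursive=True))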
+ +# cython: language_level = 3 +from cython.operator cimport dereference as deref +from libcpp.vector cimport vector as std_vector + +from pyarrow import Buffer, py_buffer +from pyarrow._compute cimport Expression +from pyarrow.lib import frombytes, tobytes +from pyarrow.lib cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_substrait cimport * + + +# TODO GH-37235: Fix exception handling +cdef CDeclaration _create_named_table_provider( + dict named_args, const std_vector[c_string]& names, const CSchema& schema +) noexcept: + cdef: + c_string c_name + shared_ptr[CTable] c_in_table + shared_ptr[CTableSourceNodeOptions] c_tablesourceopts + shared_ptr[CExecNodeOptions] c_input_node_opts + vector[CDeclaration.Input] no_c_inputs + + py_names = [] + for i in range(names.size()): + c_name = names[i] + py_names.append(frombytes(c_name)) + py_schema = pyarrow_wrap_schema(make_shared[CSchema](schema)) + + py_table = named_args["provider"](py_names, py_schema) + c_in_table = pyarrow_unwrap_table(py_table) + c_tablesourceopts = make_shared[CTableSourceNodeOptions](c_in_table) + c_input_node_opts = static_pointer_cast[CExecNodeOptions, CTableSourceNodeOptions]( + c_tablesourceopts) + return CDeclaration(tobytes("table_source"), + no_c_inputs, c_input_node_opts) + + +def run_query(plan, *, table_provider=None, use_threads=True): + """ + Execute a Substrait plan and read the results as a RecordBatchReader. + + Parameters + ---------- + plan : Union[Buffer, bytes] + The serialized Substrait plan to execute. + table_provider : object (optional) + A function to resolve any NamedTable relation to a table. + The function will receive two arguments which will be a list + of strings representing the table name and a pyarrow.Schema representing + the expected schema and should return a pyarrow.Table. + use_threads : bool, default True + If True then multiple threads will be used to run the query. If False then + all CPU intensive work will be done on the calling thread. + + Returns + ------- + RecordBatchReader + A reader containing the result of the executed query + + Examples + -------- + >>> import pyarrow as pa + >>> from pyarrow.lib import tobytes + >>> import pyarrow.substrait as substrait + >>> test_table_1 = pa.Table.from_pydict({"x": [1, 2, 3]}) + >>> test_table_2 = pa.Table.from_pydict({"x": [4, 5, 6]}) + >>> def table_provider(names, schema): + ... if not names: + ... raise Exception("No names provided") + ... elif names[0] == "t1": + ... return test_table_1 + ... elif names[1] == "t2": + ... return test_table_2 + ... else: + ... raise Exception("Unrecognized table name") + ... + >>> substrait_query = ''' + ... { + ... "relations": [ + ... {"rel": { + ... "read": { + ... "base_schema": { + ... "struct": { + ... "types": [ + ... {"i64": {}} + ... ] + ... }, + ... "names": [ + ... "x" + ... ] + ... }, + ... "namedTable": { + ... "names": ["t1"] + ... } + ... } + ... }} + ... ] + ... } + ... 
''' + >>> buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) + >>> reader = pa.substrait.run_query(buf, table_provider=table_provider) + >>> reader.read_all() + pyarrow.Table + x: int64 + ---- + x: [[1,2,3]] + """ + + cdef: + CResult[shared_ptr[CRecordBatchReader]] c_res_reader + shared_ptr[CRecordBatchReader] c_reader + RecordBatchReader reader + shared_ptr[CBuffer] c_buf_plan + CConversionOptions c_conversion_options + c_bool c_use_threads + + c_use_threads = use_threads + if isinstance(plan, bytes): + c_buf_plan = pyarrow_unwrap_buffer(py_buffer(plan)) + elif isinstance(plan, Buffer): + c_buf_plan = pyarrow_unwrap_buffer(plan) + else: + raise TypeError( + f"Expected 'pyarrow.Buffer' or bytes, got '{type(plan)}'") + + if table_provider is not None: + named_table_args = { + "provider": table_provider + } + c_conversion_options.named_table_provider = BindFunction[CNamedTableProvider]( + &_create_named_table_provider, named_table_args) + + with nogil: + c_res_reader = ExecuteSerializedPlan( + deref(c_buf_plan), default_extension_id_registry(), + GetFunctionRegistry(), c_conversion_options, c_use_threads) + + c_reader = GetResultValue(c_res_reader) + + reader = RecordBatchReader.__new__(RecordBatchReader) + reader.reader = c_reader + return reader + + +def _parse_json_plan(plan): + """ + Parse a JSON plan into equivalent serialized Protobuf. + + Parameters + ---------- + plan : bytes + Substrait plan in JSON. + + Returns + ------- + Buffer + A buffer containing the serialized Protobuf plan. + """ + + cdef: + CResult[shared_ptr[CBuffer]] c_res_buffer + c_string c_str_plan + shared_ptr[CBuffer] c_buf_plan + + c_str_plan = plan + c_res_buffer = SerializeJsonPlan(c_str_plan) + with nogil: + c_buf_plan = GetResultValue(c_res_buffer) + return pyarrow_wrap_buffer(c_buf_plan) + + +def serialize_expressions(exprs, names, schema, *, allow_arrow_extensions=False): + """ + Serialize a collection of expressions into Substrait + + Substrait expressions must be bound to a schema. For example, + the Substrait expression ``a:i32 + b:i32`` is different from the + Substrait expression ``a:i64 + b:i64``. Pyarrow expressions are + typically unbound. For example, both of the above expressions + would be represented as ``a + b`` in pyarrow. + + This means a schema must be provided when serializing an expression. + It also means that the serialization may fail if a matching function + call cannot be found for the expression. + + Parameters + ---------- + exprs : list of Expression + The expressions to serialize + names : list of str + Names for the expressions + schema : Schema + The schema the expressions will be bound to + allow_arrow_extensions : bool, default False + If False then only functions that are part of the core Substrait function + definitions will be allowed. Set this to True to allow pyarrow-specific functions + and user defined functions but the result may not be accepted by other + compute libraries. 
+ + Returns + ------- + Buffer + An ExtendedExpression message containing the serialized expressions + """ + cdef: + CResult[shared_ptr[CBuffer]] c_res_buffer + shared_ptr[CBuffer] c_buffer + CNamedExpression c_named_expr + CBoundExpressions c_bound_exprs + CConversionOptions c_conversion_options + + if len(exprs) != len(names): + raise ValueError("exprs and names need to have the same length") + for expr, name in zip(exprs, names): + if not isinstance(expr, Expression): + raise TypeError(f"Expected Expression, got '{type(expr)}' in exprs") + if not isinstance(name, str): + raise TypeError(f"Expected str, got '{type(name)}' in names") + c_named_expr.expression = ( expr).unwrap() + c_named_expr.name = tobytes( name) + c_bound_exprs.named_expressions.push_back(c_named_expr) + + c_bound_exprs.schema = ( schema).sp_schema + + c_conversion_options.allow_arrow_extensions = allow_arrow_extensions + + with nogil: + c_res_buffer = SerializeExpressions(c_bound_exprs, c_conversion_options) + c_buffer = GetResultValue(c_res_buffer) + return pyarrow_wrap_buffer(c_buffer) + + +cdef class BoundExpressions(_Weakrefable): + """ + A collection of named expressions and the schema they are bound to + + This is equivalent to the Substrait ExtendedExpression message + """ + + cdef: + CBoundExpressions c_bound_exprs + + def __init__(self): + msg = 'BoundExpressions is an abstract class thus cannot be initialized.' + raise TypeError(msg) + + cdef void init(self, CBoundExpressions bound_expressions): + self.c_bound_exprs = bound_expressions + + @property + def schema(self): + """ + The common schema that all expressions are bound to + """ + return pyarrow_wrap_schema(self.c_bound_exprs.schema) + + @property + def expressions(self): + """ + A dict from expression name to expression + """ + expr_dict = {} + for named_expr in self.c_bound_exprs.named_expressions: + name = frombytes(named_expr.name) + expr = Expression.wrap(named_expr.expression) + expr_dict[name] = expr + return expr_dict + + @staticmethod + cdef wrap(const CBoundExpressions& bound_expressions): + cdef BoundExpressions self = BoundExpressions.__new__(BoundExpressions) + self.init(bound_expressions) + return self + + +def deserialize_expressions(buf): + """ + Deserialize an ExtendedExpression Substrait message into a BoundExpressions object + + Parameters + ---------- + buf : Buffer or bytes + The message to deserialize + + Returns + ------- + BoundExpressions + The deserialized expressions, their names, and the bound schema + """ + cdef: + shared_ptr[CBuffer] c_buffer + CResult[CBoundExpressions] c_res_bound_exprs + CBoundExpressions c_bound_exprs + + if isinstance(buf, bytes): + c_buffer = pyarrow_unwrap_buffer(py_buffer(buf)) + elif isinstance(buf, Buffer): + c_buffer = pyarrow_unwrap_buffer(buf) + else: + raise TypeError( + f"Expected 'pyarrow.Buffer' or bytes, got '{type(buf)}'") + + with nogil: + c_res_bound_exprs = DeserializeExpressions(deref(c_buffer)) + c_bound_exprs = GetResultValue(c_res_bound_exprs) + + return BoundExpressions.wrap(c_bound_exprs) + + +def get_supported_functions(): + """ + Get a list of Substrait functions that the underlying + engine currently supports. 
+ + Returns + ------- + list[str] + A list of function ids encoded as '{uri}#{name}' + """ + + cdef: + ExtensionIdRegistry* c_id_registry + std_vector[c_string] c_ids + + c_id_registry = default_extension_id_registry() + c_ids = c_id_registry.GetSupportedSubstraitFunctions() + + functions_list = [] + for c_id in c_ids: + functions_list.append(frombytes(c_id)) + return functions_list diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/acero.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/acero.py new file mode 100644 index 0000000000000000000000000000000000000000..77ba3ab1ce85ddba5b50e2370928fc611ee00478 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/acero.py @@ -0,0 +1,403 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# --------------------------------------------------------------------- +# Implement Internal ExecPlan bindings + +# cython: profile=False +# distutils: language = c++ +# cython: language_level = 3 + +from pyarrow.lib import Table, RecordBatch +from pyarrow.compute import Expression, field + +try: + from pyarrow._acero import ( # noqa + Declaration, + ExecNodeOptions, + TableSourceNodeOptions, + FilterNodeOptions, + ProjectNodeOptions, + AggregateNodeOptions, + OrderByNodeOptions, + HashJoinNodeOptions, + AsofJoinNodeOptions, + ) +except ImportError as exc: + raise ImportError( + f"The pyarrow installation is not built with support for 'acero' ({str(exc)})" + ) from None + + +try: + import pyarrow.dataset as ds + from pyarrow._dataset import ScanNodeOptions +except ImportError: + class DatasetModuleStub: + class Dataset: + pass + + class InMemoryDataset: + pass + ds = DatasetModuleStub + + +def _dataset_to_decl(dataset, use_threads=True): + decl = Declaration("scan", ScanNodeOptions(dataset, use_threads=use_threads)) + + # Get rid of special dataset columns + # "__fragment_index", "__batch_index", "__last_in_fragment", "__filename" + projections = [field(f) for f in dataset.schema.names] + decl = Declaration.from_sequence( + [decl, Declaration("project", ProjectNodeOptions(projections))] + ) + + filter_expr = dataset._scan_options.get("filter") + if filter_expr is not None: + # Filters applied in CScanNodeOptions are "best effort" for the scan node itself + # so we always need to inject an additional Filter node to apply them for real. + decl = Declaration.from_sequence( + [decl, Declaration("filter", FilterNodeOptions(filter_expr))] + ) + + return decl + + +def _perform_join(join_type, left_operand, left_keys, + right_operand, right_keys, + left_suffix=None, right_suffix=None, + use_threads=True, coalesce_keys=False, + output_type=Table): + """ + Perform join of two tables or datasets. 
+ + The result will be an output table with the result of the join operation + + Parameters + ---------- + join_type : str + One of supported join types. + left_operand : Table or Dataset + The left operand for the join operation. + left_keys : str or list[str] + The left key (or keys) on which the join operation should be performed. + right_operand : Table or Dataset + The right operand for the join operation. + right_keys : str or list[str] + The right key (or keys) on which the join operation should be performed. + left_suffix : str, default None + Which suffix to add to left column names. This prevents confusion + when the columns in left and right operands have colliding names. + right_suffix : str, default None + Which suffix to add to the right column names. This prevents confusion + when the columns in left and right operands have colliding names. + use_threads : bool, default True + Whether to use multithreading or not. + coalesce_keys : bool, default False + If the duplicated keys should be omitted from one of the sides + in the join result. + output_type: Table or InMemoryDataset + The output type for the exec plan result. + + Returns + ------- + result_table : Table or InMemoryDataset + """ + if not isinstance(left_operand, (Table, ds.Dataset)): + raise TypeError(f"Expected Table or Dataset, got {type(left_operand)}") + if not isinstance(right_operand, (Table, ds.Dataset)): + raise TypeError(f"Expected Table or Dataset, got {type(right_operand)}") + + # Prepare left and right tables Keys to send them to the C++ function + left_keys_order = {} + if not isinstance(left_keys, (tuple, list)): + left_keys = [left_keys] + for idx, key in enumerate(left_keys): + left_keys_order[key] = idx + + right_keys_order = {} + if not isinstance(right_keys, (list, tuple)): + right_keys = [right_keys] + for idx, key in enumerate(right_keys): + right_keys_order[key] = idx + + # By default expose all columns on both left and right table + left_columns = left_operand.schema.names + right_columns = right_operand.schema.names + + # Pick the join type + if join_type == "left semi" or join_type == "left anti": + right_columns = [] + elif join_type == "right semi" or join_type == "right anti": + left_columns = [] + elif join_type == "inner" or join_type == "left outer": + right_columns = [ + col for col in right_columns if col not in right_keys_order + ] + elif join_type == "right outer": + left_columns = [ + col for col in left_columns if col not in left_keys_order + ] + + # Turn the columns to vectors of FieldRefs + # and set aside indices of keys. 
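+    # The positional indices recorded below are used later (for
+    # coalesce_keys with a "full outer" join) to pair each left key column
+    # with its matching right key column in the output projection.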
+ left_column_keys_indices = {} + for idx, colname in enumerate(left_columns): + if colname in left_keys: + left_column_keys_indices[colname] = idx + right_column_keys_indices = {} + for idx, colname in enumerate(right_columns): + if colname in right_keys: + right_column_keys_indices[colname] = idx + + # Add the join node to the execplan + if isinstance(left_operand, ds.Dataset): + left_source = _dataset_to_decl(left_operand, use_threads=use_threads) + else: + left_source = Declaration("table_source", TableSourceNodeOptions(left_operand)) + if isinstance(right_operand, ds.Dataset): + right_source = _dataset_to_decl(right_operand, use_threads=use_threads) + else: + right_source = Declaration( + "table_source", TableSourceNodeOptions(right_operand) + ) + + if coalesce_keys: + join_opts = HashJoinNodeOptions( + join_type, left_keys, right_keys, left_columns, right_columns, + output_suffix_for_left=left_suffix or "", + output_suffix_for_right=right_suffix or "", + ) + else: + join_opts = HashJoinNodeOptions( + join_type, left_keys, right_keys, + output_suffix_for_left=left_suffix or "", + output_suffix_for_right=right_suffix or "", + ) + decl = Declaration( + "hashjoin", options=join_opts, inputs=[left_source, right_source] + ) + + if coalesce_keys and join_type == "full outer": + # In case of full outer joins, the join operation will output all columns + # so that we can coalesce the keys and exclude duplicates in a subsequent + # projection. + left_columns_set = set(left_columns) + right_columns_set = set(right_columns) + # Where the right table columns start. + right_operand_index = len(left_columns) + projected_col_names = [] + projections = [] + for idx, col in enumerate(left_columns + right_columns): + if idx < len(left_columns) and col in left_column_keys_indices: + # Include keys only once and coalesce left+right table keys. + projected_col_names.append(col) + # Get the index of the right key that is being paired + # with this left key. We do so by retrieving the name + # of the right key that is in the same position in the provided keys + # and then looking up the index for that name in the right table. + right_key_index = right_column_keys_indices[ + right_keys[left_keys_order[col]]] + projections.append( + Expression._call("coalesce", [ + Expression._field(idx), Expression._field( + right_operand_index+right_key_index) + ]) + ) + elif idx >= right_operand_index and col in right_column_keys_indices: + # Do not include right table keys. As they would lead to duplicated keys + continue + else: + # For all the other columns include them as they are. + # Just recompute the suffixes that the join produced as the projection + # would lose them otherwise. 
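+                # A non-key column only needs a suffix when its name also
+                # appears on the other side; re-applying it here mirrors the
+                # suffix the hashjoin node itself would have produced.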
+ if ( + left_suffix and idx < right_operand_index + and col in right_columns_set + ): + col += left_suffix + if ( + right_suffix and idx >= right_operand_index + and col in left_columns_set + ): + col += right_suffix + projected_col_names.append(col) + projections.append( + Expression._field(idx) + ) + projection = Declaration( + "project", ProjectNodeOptions(projections, projected_col_names) + ) + decl = Declaration.from_sequence([decl, projection]) + + result_table = decl.to_table(use_threads=use_threads) + + if output_type == Table: + return result_table + elif output_type == ds.InMemoryDataset: + return ds.InMemoryDataset(result_table) + else: + raise TypeError("Unsupported output type") + + +def _perform_join_asof(left_operand, left_on, left_by, + right_operand, right_on, right_by, + tolerance, use_threads=True, + output_type=Table): + """ + Perform asof join of two tables or datasets. + + The result will be an output table with the result of the join operation + + Parameters + ---------- + left_operand : Table or Dataset + The left operand for the join operation. + left_on : str + The left key (or keys) on which the join operation should be performed. + left_by: str or list[str] + The left key (or keys) on which the join operation should be performed. + right_operand : Table or Dataset + The right operand for the join operation. + right_on : str or list[str] + The right key (or keys) on which the join operation should be performed. + right_by: str or list[str] + The right key (or keys) on which the join operation should be performed. + tolerance : int + The tolerance to use for the asof join. The tolerance is interpreted in + the same units as the "on" key. + output_type: Table or InMemoryDataset + The output type for the exec plan result. + + Returns + ------- + result_table : Table or InMemoryDataset + """ + if not isinstance(left_operand, (Table, ds.Dataset)): + raise TypeError(f"Expected Table or Dataset, got {type(left_operand)}") + if not isinstance(right_operand, (Table, ds.Dataset)): + raise TypeError(f"Expected Table or Dataset, got {type(right_operand)}") + + if not isinstance(left_by, (tuple, list)): + left_by = [left_by] + if not isinstance(right_by, (tuple, list)): + right_by = [right_by] + + # AsofJoin does not return on or by columns for right_operand. + right_columns = [ + col for col in right_operand.schema.names + if col not in [right_on] + right_by + ] + columns_collisions = set(left_operand.schema.names) & set(right_columns) + if columns_collisions: + raise ValueError( + "Columns {} present in both tables. 
AsofJoin does not support " + "column collisions.".format(columns_collisions), + ) + + # Add the join node to the execplan + if isinstance(left_operand, ds.Dataset): + left_source = _dataset_to_decl(left_operand, use_threads=use_threads) + else: + left_source = Declaration( + "table_source", TableSourceNodeOptions(left_operand), + ) + if isinstance(right_operand, ds.Dataset): + right_source = _dataset_to_decl(right_operand, use_threads=use_threads) + else: + right_source = Declaration( + "table_source", TableSourceNodeOptions(right_operand) + ) + + join_opts = AsofJoinNodeOptions( + left_on, left_by, right_on, right_by, tolerance + ) + decl = Declaration( + "asofjoin", options=join_opts, inputs=[left_source, right_source] + ) + + result_table = decl.to_table(use_threads=use_threads) + + if output_type == Table: + return result_table + elif output_type == ds.InMemoryDataset: + return ds.InMemoryDataset(result_table) + else: + raise TypeError("Unsupported output type") + + +def _filter_table(table, expression): + """Filter rows of a table based on the provided expression. + + The result will be an output table with only the rows matching + the provided expression. + + Parameters + ---------- + table : Table or RecordBatch + Table that should be filtered. + expression : Expression + The expression on which rows should be filtered. + + Returns + ------- + Table + """ + is_batch = False + if isinstance(table, RecordBatch): + table = Table.from_batches([table]) + is_batch = True + + decl = Declaration.from_sequence([ + Declaration("table_source", options=TableSourceNodeOptions(table)), + Declaration("filter", options=FilterNodeOptions(expression)) + ]) + result = decl.to_table(use_threads=True) + if is_batch: + result = result.combine_chunks().to_batches()[0] + return result + + +def _sort_source(table_or_dataset, sort_keys, output_type=Table, **kwargs): + + if isinstance(table_or_dataset, ds.Dataset): + data_source = _dataset_to_decl(table_or_dataset, use_threads=True) + else: + data_source = Declaration( + "table_source", TableSourceNodeOptions(table_or_dataset) + ) + + order_by = Declaration("order_by", OrderByNodeOptions(sort_keys, **kwargs)) + + decl = Declaration.from_sequence([data_source, order_by]) + result_table = decl.to_table(use_threads=True) + + if output_type == Table: + return result_table + elif output_type == ds.InMemoryDataset: + return ds.InMemoryDataset(result_table) + else: + raise TypeError("Unsupported output type") + + +def _group_by(table, aggregates, keys, use_threads=True): + + decl = Declaration.from_sequence([ + Declaration("table_source", TableSourceNodeOptions(table)), + Declaration("aggregate", AggregateNodeOptions(aggregates, keys=keys)) + ]) + return decl.to_table(use_threads=use_threads) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/array.pxi b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/array.pxi new file mode 100644 index 0000000000000000000000000000000000000000..ae9e7fd777ed1efaa13ce9f5273cb81e5c1aa033 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/array.pxi @@ -0,0 +1,4792 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New + +import os +import warnings +from cython import sizeof + + +cdef _sequence_to_array(object sequence, object mask, object size, + DataType type, CMemoryPool* pool, c_bool from_pandas): + cdef: + int64_t c_size + PyConversionOptions options + shared_ptr[CChunkedArray] chunked + + if type is not None: + options.type = type.sp_type + + if size is not None: + options.size = size + + options.from_pandas = from_pandas + options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) + + with nogil: + chunked = GetResultValue( + ConvertPySequence(sequence, mask, options, pool) + ) + + if chunked.get().num_chunks() == 1: + return pyarrow_wrap_array(chunked.get().chunk(0)) + else: + return pyarrow_wrap_chunked_array(chunked) + + +cdef inline _is_array_like(obj): + if np is None: + return False + if isinstance(obj, np.ndarray): + return True + return pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj) + + +def _ndarray_to_arrow_type(object values, DataType type): + return pyarrow_wrap_data_type(_ndarray_to_type(values, type)) + + +cdef shared_ptr[CDataType] _ndarray_to_type(object values, + DataType type) except *: + cdef shared_ptr[CDataType] c_type + + dtype = values.dtype + + if type is None and dtype != object: + c_type = GetResultValue(NumPyDtypeToArrow(dtype)) + + if type is not None: + c_type = type.sp_type + + return c_type + + +cdef _ndarray_to_array(object values, object mask, DataType type, + c_bool from_pandas, c_bool safe, CMemoryPool* pool): + cdef: + shared_ptr[CChunkedArray] chunked_out + shared_ptr[CDataType] c_type = _ndarray_to_type(values, type) + CCastOptions cast_options = CCastOptions(safe) + + with nogil: + check_status(NdarrayToArrow(pool, values, mask, from_pandas, + c_type, cast_options, &chunked_out)) + + if chunked_out.get().num_chunks() > 1: + return pyarrow_wrap_chunked_array(chunked_out) + else: + return pyarrow_wrap_array(chunked_out.get().chunk(0)) + + +cdef _codes_to_indices(object codes, object mask, DataType type, + MemoryPool memory_pool): + """ + Convert the codes of a pandas Categorical to indices for a pyarrow + DictionaryArray, taking into account missing values + mask + """ + if mask is None: + mask = codes == -1 + else: + mask = mask | (codes == -1) + return array(codes, mask=mask, type=type, memory_pool=memory_pool) + + +def _handle_arrow_array_protocol(obj, type, mask, size): + if mask is not None or size is not None: + raise ValueError( + "Cannot specify a mask or a size when passing an object that is " + "converted with the __arrow_array__ protocol.") + res = obj.__arrow_array__(type=type) + if not isinstance(res, (Array, ChunkedArray)): + raise TypeError("The object's __arrow_array__ method does not " + "return a pyarrow Array or ChunkedArray.") + if isinstance(res, ChunkedArray) and res.num_chunks==1: + res = res.chunk(0) + return res + + +def array(object obj, type=None, mask=None, size=None, from_pandas=None, + bint safe=True, MemoryPool memory_pool=None): + """ + Create pyarrow.Array instance from a Python object. 
+ + Parameters + ---------- + obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array + If both type and size are specified may be a single use iterable. If + not strongly-typed, Arrow type will be inferred for resulting array. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) + can be passed as well. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the data. + mask : array[bool], optional + Indicate which values are null (True) or not null (False). + size : int64, optional + Size of the elements. If the input is larger than size bail at this + length. For iterators, if size is larger than the input iterator this + will be treated as a "max size", but will involve an initial allocation + of size followed by a resize to the actual size (so if you know the + exact size specifying it correctly will give you better performance). + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. If passed, the mask tasks precedence, but + if a value is unmasked (not-null), but still null according to + pandas semantics, then it is null. Defaults to False if not + passed explicitly by user, or True if a pandas object is + passed in. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + A ChunkedArray instead of an Array is returned if: + + - the object data overflowed binary storage. + - the object's ``__arrow_array__`` protocol method returned a chunked + array. + + Notes + ----- + Timezone will be preserved in the returned array for timezone-aware data, + else no timezone will be returned for naive timestamps. + Internally, UTC values are stored for timezone-aware data with the + timezone set in the data type. + + Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by + default converted as MonthDayNanoIntervalArray. relativedelta leapdays + are ignored as are all absolute fields on both objects. datetime.timedelta + can also be converted to MonthDayNanoIntervalArray but this requires + passing MonthDayNanoIntervalType explicitly. + + Converting to dictionary array will promote to a wider integer type for + indices if the number of distinct values cannot be represented, even if + the index type was explicitly set. This means that if there are more than + 127 values the returned dictionary array's index type will be at least + pa.int16() even if pa.int8() was passed to the function. Note that an + explicit index type will not be demoted even if it is wider than required. + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> pa.array(pd.Series([1, 2])) + + [ + 1, + 2 + ] + + >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + + ... 
+ -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1, + 0 + ] + + >>> import numpy as np + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) + + [ + 1, + null + ] + + >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) + >>> arr.type.index_type + DataType(int16) + """ + cdef: + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + bint is_pandas_object = False + bint c_from_pandas + + type = ensure_type(type, allow_none=True) + + extension_type = None + if type is not None and type.id == _Type_EXTENSION: + extension_type = type + type = type.storage_type + + if from_pandas is None: + c_from_pandas = False + else: + c_from_pandas = from_pandas + + if isinstance(obj, Array): + if type is not None and not obj.type.equals(type): + obj = obj.cast(type, safe=safe, memory_pool=memory_pool) + return obj + + if hasattr(obj, '__arrow_array__'): + return _handle_arrow_array_protocol(obj, type, mask, size) + elif hasattr(obj, '__arrow_c_device_array__'): + if type is not None: + requested_type = type.__arrow_c_schema__() + else: + requested_type = None + schema_capsule, array_capsule = obj.__arrow_c_device_array__(requested_type) + out_array = Array._import_from_c_device_capsule(schema_capsule, array_capsule) + if type is not None and out_array.type != type: + # PyCapsule interface type coercion is best effort, so we need to + # check the type of the returned array and cast if necessary + out_array = array.cast(type, safe=safe, memory_pool=memory_pool) + return out_array + elif hasattr(obj, '__arrow_c_array__'): + if type is not None: + requested_type = type.__arrow_c_schema__() + else: + requested_type = None + schema_capsule, array_capsule = obj.__arrow_c_array__(requested_type) + out_array = Array._import_from_c_capsule(schema_capsule, array_capsule) + if type is not None and out_array.type != type: + # PyCapsule interface type coercion is best effort, so we need to + # check the type of the returned array and cast if necessary + out_array = array.cast(type, safe=safe, memory_pool=memory_pool) + return out_array + elif _is_array_like(obj): + if mask is not None: + if _is_array_like(mask): + mask = get_values(mask, &is_pandas_object) + else: + raise TypeError("Mask must be a numpy array " + "when converting numpy arrays") + + values = get_values(obj, &is_pandas_object) + if is_pandas_object and from_pandas is None: + c_from_pandas = True + + if isinstance(values, np.ma.MaskedArray): + if mask is not None: + raise ValueError("Cannot pass a numpy masked array and " + "specify a mask at the same time") + else: + # don't use shrunken masks + mask = None if values.mask is np.ma.nomask else values.mask + values = values.data + + if mask is not None: + if mask.dtype != np.bool_: + raise TypeError("Mask must be boolean dtype") + if mask.ndim != 1: + raise ValueError("Mask must be 1D array") + if len(values) != len(mask): + raise ValueError( + "Mask is a different length from sequence being converted") + + if hasattr(values, '__arrow_array__'): + return _handle_arrow_array_protocol(values, type, mask, size) + elif (pandas_api.is_categorical(values) and + type is not None and type.id != Type_DICTIONARY): + result = _ndarray_to_array( + np.asarray(values), mask, type, c_from_pandas, safe, pool + ) + elif pandas_api.is_categorical(values): + if type is not None: + index_type = type.index_type + value_type = type.value_type + if values.ordered != type.ordered: + raise ValueError( + "The 'ordered' flag of the passed categorical values " + "does not match 
the 'ordered' of the specified type. ") + else: + index_type = None + value_type = None + + indices = _codes_to_indices( + values.codes, mask, index_type, memory_pool) + try: + dictionary = array( + values.categories.values, type=value_type, + memory_pool=memory_pool) + except TypeError: + # TODO when removing the deprecation warning, this whole + # try/except can be removed (to bubble the TypeError of + # the first array(..) call) + if value_type is not None: + warnings.warn( + "The dtype of the 'categories' of the passed " + "categorical values ({0}) does not match the " + "specified type ({1}). For now ignoring the specified " + "type, but in the future this mismatch will raise a " + "TypeError".format( + values.categories.dtype, value_type), + FutureWarning, stacklevel=2) + dictionary = array( + values.categories.values, memory_pool=memory_pool) + else: + raise + + return DictionaryArray.from_arrays( + indices, dictionary, ordered=values.ordered, safe=safe) + else: + if pandas_api.have_pandas: + values, type = pandas_api.compat.get_datetimetz_type( + values, obj.dtype, type) + if type and type.id == _Type_RUN_END_ENCODED: + arr = _ndarray_to_array( + values, mask, type.value_type, c_from_pandas, safe, pool) + result = _pc().run_end_encode(arr, run_end_type=type.run_end_type, + memory_pool=memory_pool) + else: + result = _ndarray_to_array(values, mask, type, c_from_pandas, safe, + pool) + else: + if type and type.id == _Type_RUN_END_ENCODED: + arr = _sequence_to_array( + obj, mask, size, type.value_type, pool, from_pandas) + result = _pc().run_end_encode(arr, run_end_type=type.run_end_type, + memory_pool=memory_pool) + # ConvertPySequence does strict conversion if type is explicitly passed + else: + result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas) + + if extension_type is not None: + result = ExtensionArray.from_storage(extension_type, result) + return result + + +def asarray(values, type=None): + """ + Convert to pyarrow.Array, inferring type if not provided. + + Parameters + ---------- + values : array-like + This can be a sequence, numpy.ndarray, pyarrow.Array or + pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be + a ChunkedArray, otherwise the output will be a Array. + type : string or DataType + Explicitly construct the array with this type. Attempt to cast if + indicated type is different. + + Returns + ------- + arr : Array or ChunkedArray + """ + if isinstance(values, (Array, ChunkedArray)): + if type is not None and not values.type.equals(type): + values = values.cast(type) + return values + else: + return array(values, type=type) + + +def nulls(size, type=None, MemoryPool memory_pool=None): + """ + Create a strongly-typed Array instance with all elements null. + + Parameters + ---------- + size : int + Array length. + type : pyarrow.DataType, default None + Explicit type for the array. By default use NullType. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. 
+ + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.nulls(10) + + 10 nulls + + >>> pa.nulls(3, pa.uint32()) + + [ + null, + null, + null + ] + """ + cdef: + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + int64_t length = size + shared_ptr[CDataType] ty + shared_ptr[CArray] arr + + type = ensure_type(type, allow_none=True) + if type is None: + type = null() + + ty = pyarrow_unwrap_data_type(type) + with nogil: + arr = GetResultValue(MakeArrayOfNull(ty, length, pool)) + + return pyarrow_wrap_array(arr) + + +def repeat(value, size, MemoryPool memory_pool=None): + """ + Create an Array instance whose slots are the given scalar. + + Parameters + ---------- + value : Scalar-like object + Either a pyarrow.Scalar or any python object coercible to a Scalar. + size : int + Number of times to repeat the scalar in the output Array. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.repeat(10, 3) + + [ + 10, + 10, + 10 + ] + + >>> pa.repeat([1, 2], 2) + + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + + >>> pa.repeat("string", 3) + + [ + "string", + "string", + "string" + ] + + >>> pa.repeat(pa.scalar({'a': 1, 'b': [1, 2]}), 2) + + -- is_valid: all not null + -- child 0 type: int64 + [ + 1, + 1 + ] + -- child 1 type: list + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + """ + cdef: + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + int64_t length = size + shared_ptr[CArray] c_array + shared_ptr[CScalar] c_scalar + + if not isinstance(value, Scalar): + value = scalar(value, memory_pool=memory_pool) + + c_scalar = ( value).unwrap() + with nogil: + c_array = GetResultValue( + MakeArrayFromScalar(deref(c_scalar), length, pool) + ) + + return pyarrow_wrap_array(c_array) + + +def infer_type(values, mask=None, from_pandas=False): + """ + Attempt to infer Arrow data type that can hold the passed Python + sequence type in an Array object + + Parameters + ---------- + values : array-like + Sequence to infer type from. + mask : ndarray (bool type), optional + Optional exclusion mask where True marks null, False non-null. + from_pandas : bool, default False + Use pandas's NA/null sentinel values for type inference. 
+ + Returns + ------- + type : DataType + """ + cdef: + shared_ptr[CDataType] out + c_bool use_pandas_sentinels = from_pandas + + if mask is not None and not isinstance(mask, np.ndarray): + mask = np.array(mask, dtype=bool) + + out = GetResultValue(InferArrowType(values, mask, use_pandas_sentinels)) + return pyarrow_wrap_data_type(out) + + +def _normalize_slice(object arrow_obj, slice key): + """ + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + """ + cdef: + Py_ssize_t start, stop, step + Py_ssize_t n = len(arrow_obj) + + start, stop, step = key.indices(n) + + if step != 1: + indices = np.arange(start, stop, step) + return arrow_obj.take(indices) + else: + length = max(stop - start, 0) + return arrow_obj.slice(start, length) + + +cdef Py_ssize_t _normalize_index(Py_ssize_t index, + Py_ssize_t length) except -1: + if index < 0: + index += length + if index < 0: + raise IndexError("index out of bounds") + elif index >= length: + raise IndexError("index out of bounds") + return index + + +cdef wrap_datum(const CDatum& datum): + if datum.kind() == DatumType_ARRAY: + return pyarrow_wrap_array(MakeArray(datum.array())) + elif datum.kind() == DatumType_CHUNKED_ARRAY: + return pyarrow_wrap_chunked_array(datum.chunked_array()) + elif datum.kind() == DatumType_RECORD_BATCH: + return pyarrow_wrap_batch(datum.record_batch()) + elif datum.kind() == DatumType_TABLE: + return pyarrow_wrap_table(datum.table()) + elif datum.kind() == DatumType_SCALAR: + return pyarrow_wrap_scalar(datum.scalar()) + else: + raise ValueError("Unable to wrap Datum in a Python object") + + +cdef _append_array_buffers(const CArrayData* ad, list res): + """ + Recursively append Buffer wrappers from *ad* and its children. + """ + cdef size_t i, n + assert ad != NULL + n = ad.buffers.size() + for i in range(n): + buf = ad.buffers[i] + res.append(pyarrow_wrap_buffer(buf) + if buf.get() != NULL else None) + n = ad.child_data.size() + for i in range(n): + _append_array_buffers(ad.child_data[i].get(), res) + + +cdef _reduce_array_data(const CArrayData* ad): + """ + Recursively dissect ArrayData to (pickable) tuples. + """ + cdef size_t i, n + assert ad != NULL + + n = ad.buffers.size() + buffers = [] + for i in range(n): + buf = ad.buffers[i] + buffers.append(pyarrow_wrap_buffer(buf) + if buf.get() != NULL else None) + + children = [] + n = ad.child_data.size() + for i in range(n): + children.append(_reduce_array_data(ad.child_data[i].get())) + + if ad.dictionary.get() != NULL: + dictionary = _reduce_array_data(ad.dictionary.get()) + else: + dictionary = None + + return pyarrow_wrap_data_type(ad.type), ad.length, ad.null_count, \ + ad.offset, buffers, children, dictionary + + +cdef shared_ptr[CArrayData] _reconstruct_array_data(data): + """ + Reconstruct CArrayData objects from the tuple structure generated + by _reduce_array_data. 
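+    It is invoked by _restore_array when unpickling an Array.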
+ """ + cdef: + int64_t length, null_count, offset, i + DataType dtype + Buffer buf + vector[shared_ptr[CBuffer]] c_buffers + vector[shared_ptr[CArrayData]] c_children + shared_ptr[CArrayData] c_dictionary + + dtype, length, null_count, offset, buffers, children, dictionary = data + + for i in range(len(buffers)): + buf = buffers[i] + if buf is None: + c_buffers.push_back(shared_ptr[CBuffer]()) + else: + c_buffers.push_back(buf.buffer) + + for i in range(len(children)): + c_children.push_back(_reconstruct_array_data(children[i])) + + if dictionary is not None: + c_dictionary = _reconstruct_array_data(dictionary) + + return CArrayData.MakeWithChildrenAndDictionary( + dtype.sp_type, + length, + c_buffers, + c_children, + c_dictionary, + null_count, + offset) + + +def _restore_array(data): + """ + Reconstruct an Array from pickled ArrayData. + """ + cdef shared_ptr[CArrayData] ad = _reconstruct_array_data(data) + return pyarrow_wrap_array(MakeArray(ad)) + + +cdef class _PandasConvertible(_Weakrefable): + + def to_pandas( + self, + memory_pool=None, + categories=None, + bint strings_to_categorical=False, + bint zero_copy_only=False, + bint integer_object_nulls=False, + bint date_as_object=True, + bint timestamp_as_object=False, + bint use_threads=True, + bint deduplicate_objects=True, + bint ignore_metadata=False, + bint safe=True, + bint split_blocks=False, + bint self_destruct=False, + str maps_as_pydicts=None, + types_mapper=None, + bint coerce_temporal_nanoseconds=False + ): + """ + Convert to a pandas-compatible NumPy array or DataFrame, as appropriate + + Parameters + ---------- + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + categories : list, default empty + List of fields that should be returned as pandas.Categorical. Only + applies to table-like data structures. + strings_to_categorical : bool, default False + Encode string (UTF8) and binary types to pandas.Categorical. + zero_copy_only : bool, default False + Raise an ArrowException if this function call would require copying + the underlying data. + integer_object_nulls : bool, default False + Cast integers with nulls to objects + date_as_object : bool, default True + Cast dates to objects. If False, convert to datetime64 dtype with + the equivalent time unit (if supported). Note: in pandas version + < 2.0, only datetime64[ns] conversion is supported. + timestamp_as_object : bool, default False + Cast non-nanosecond timestamps (np.datetime64) to objects. This is + useful in pandas version 1.x if you have timestamps that don't fit + in the normal date range of nanosecond timestamps (1678 CE-2262 CE). + Non-nanosecond timestamps are supported in pandas version 2.0. + If False, all timestamps are converted to datetime64 dtype. + use_threads : bool, default True + Whether to parallelize the conversion using multiple threads. + deduplicate_objects : bool, default True + Do not create multiple copies Python objects when created, to save + on memory use. Conversion will be slower. + ignore_metadata : bool, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present + safe : bool, default True + For certain data types, a cast is needed in order to store the + data in a pandas DataFrame or Series (e.g. timestamps are always + stored as nanoseconds in pandas). This option controls whether it + is a safe cast or not. 
+ split_blocks : bool, default False + If True, generate one internal "block" for each column when + creating a pandas.DataFrame from a RecordBatch or Table. While this + can temporarily reduce memory note that various pandas operations + can trigger "consolidation" which may balloon memory use. + self_destruct : bool, default False + EXPERIMENTAL: If True, attempt to deallocate the originating Arrow + memory while converting the Arrow object to pandas. If you use the + object after calling to_pandas with this option it will crash your + program. + + Note that you may not see always memory usage improvements. For + example, if multiple columns share an underlying allocation, + memory can't be freed until all columns are converted. + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. + types_mapper : function, default None + A function mapping a pyarrow DataType to a pandas ExtensionDtype. + This can be used to override the default pandas type for conversion + of built-in pyarrow types or in absence of pandas_metadata in the + Table schema. The function receives a pyarrow DataType and is + expected to return a pandas ExtensionDtype or ``None`` if the + default conversion should be used for that type. If you have + a dictionary mapping, you can pass ``dict.get`` as function. + coerce_temporal_nanoseconds : bool, default False + Only applicable to pandas version >= 2.0. + A legacy option to coerce date32, date64, duration, and timestamp + time units to nanoseconds when converting to pandas. This is the + default behavior in pandas version 1.x. Set this option to True if + you'd like to use this coercion when using pandas version >= 2.0 + for backwards compatibility (not recommended otherwise). + + Returns + ------- + pandas.Series or pandas.DataFrame depending on type of object + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + + Convert a Table to pandas DataFrame: + + >>> table = pa.table([ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + ... ], names=['n_legs', 'animals']) + >>> table.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(table.to_pandas(), pd.DataFrame) + True + + Convert a RecordBatch to pandas DataFrame: + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> batch = pa.record_batch([n_legs, animals], + ... 
names=["n_legs", "animals"]) + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(batch.to_pandas(), pd.DataFrame) + True + + Convert a Chunked Array to pandas Series: + + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_pandas() + 0 2 + 1 2 + 2 4 + 3 4 + 4 5 + 5 100 + dtype: int64 + >>> isinstance(n_legs.to_pandas(), pd.Series) + True + """ + options = dict( + pool=memory_pool, + strings_to_categorical=strings_to_categorical, + zero_copy_only=zero_copy_only, + integer_object_nulls=integer_object_nulls, + date_as_object=date_as_object, + timestamp_as_object=timestamp_as_object, + use_threads=use_threads, + deduplicate_objects=deduplicate_objects, + safe=safe, + split_blocks=split_blocks, + self_destruct=self_destruct, + maps_as_pydicts=maps_as_pydicts, + coerce_temporal_nanoseconds=coerce_temporal_nanoseconds + ) + return self._to_pandas(options, categories=categories, + ignore_metadata=ignore_metadata, + types_mapper=types_mapper) + + +cdef PandasOptions _convert_pandas_options(dict options): + cdef PandasOptions result + result.pool = maybe_unbox_memory_pool(options['pool']) + result.strings_to_categorical = options['strings_to_categorical'] + result.zero_copy_only = options['zero_copy_only'] + result.integer_object_nulls = options['integer_object_nulls'] + result.date_as_object = options['date_as_object'] + result.timestamp_as_object = options['timestamp_as_object'] + result.use_threads = options['use_threads'] + result.deduplicate_objects = options['deduplicate_objects'] + result.safe_cast = options['safe'] + result.split_blocks = options['split_blocks'] + result.self_destruct = options['self_destruct'] + result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds'] + result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) + + maps_as_pydicts = options['maps_as_pydicts'] + if maps_as_pydicts is None: + result.maps_as_pydicts = MapConversionType.DEFAULT + elif maps_as_pydicts == "lossy": + result.maps_as_pydicts = MapConversionType.LOSSY + elif maps_as_pydicts == "strict": + result.maps_as_pydicts = MapConversionType.STRICT_ + else: + raise ValueError( + "Invalid value for 'maps_as_pydicts': " + + "valid values are 'lossy', 'strict' or `None` (default). " + + f"Received '{maps_as_pydicts}'." + ) + return result + + +cdef class Array(_PandasConvertible): + """ + The base class for all Arrow arrays. + """ + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly, use one of " + "the `pyarrow.Array.from_*` functions instead." + .format(self.__class__.__name__)) + + cdef void init(self, const shared_ptr[CArray]& sp_array) except *: + self.sp_array = sp_array + self.ap = sp_array.get() + self.type = pyarrow_wrap_data_type(self.sp_array.get().type()) + + def _debug_print(self): + with nogil: + check_status(DebugPrint(deref(self.ap), 0)) + + def diff(self, Array other): + """ + Compare contents of this array against another one. + + Return a string containing the result of diffing this array + (on the left side) against the other array (on the right side). + + Parameters + ---------- + other : Array + The other array to compare this array with. + + Returns + ------- + diff : str + A human-readable printout of the differences. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> left = pa.array(["one", "two", "three"]) + >>> right = pa.array(["two", None, "two-and-a-half", "three"]) + >>> print(left.diff(right)) # doctest: +SKIP + + @@ -0, +0 @@ + -"one" + @@ -2, +1 @@ + +null + +"two-and-a-half" + + """ + self._assert_cpu() + cdef c_string result + with nogil: + result = self.ap.Diff(deref(other.ap)) + return frombytes(result, safe=True) + + def cast(self, object target_type=None, safe=None, options=None, memory_pool=None): + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Returns + ------- + cast : Array + """ + self._assert_cpu() + return _pc().cast(self, target_type, safe=safe, + options=options, memory_pool=memory_pool) + + def view(self, object target_type): + """ + Return zero-copy "view" of array as another data type. + + The data types must have compatible columnar buffer layouts + + Parameters + ---------- + target_type : DataType + Type to construct view as. + + Returns + ------- + view : Array + """ + self._assert_cpu() + cdef DataType type = ensure_type(target_type) + cdef shared_ptr[CArray] result + with nogil: + result = GetResultValue(self.ap.View(type.sp_type)) + return pyarrow_wrap_array(result) + + def sum(self, **kwargs): + """ + Sum the values in a numerical array. + + See :func:`pyarrow.compute.sum` for full usage. + + Parameters + ---------- + **kwargs : dict, optional + Options to pass to :func:`pyarrow.compute.sum`. + + Returns + ------- + sum : Scalar + A scalar containing the sum value. + """ + self._assert_cpu() + options = _pc().ScalarAggregateOptions(**kwargs) + return _pc().call_function('sum', [self], options) + + def unique(self): + """ + Compute distinct elements in array. + + Returns + ------- + unique : Array + An array of the same data type, with deduplicated elements. + """ + self._assert_cpu() + return _pc().call_function('unique', [self]) + + def dictionary_encode(self, null_encoding='mask'): + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : DictionaryArray + A dictionary-encoded version of this array. + """ + self._assert_cpu() + options = _pc().DictionaryEncodeOptions(null_encoding) + return _pc().call_function('dictionary_encode', [self], options) + + def value_counts(self): + """ + Compute counts of unique elements in array. + + Returns + ------- + StructArray + An array of structs + """ + self._assert_cpu() + return _pc().call_function('value_counts', [self]) + + @staticmethod + def from_pandas(obj, mask=None, type=None, bint safe=True, + MemoryPool memory_pool=None): + """ + Convert pandas.Series to an Arrow Array. + + This method uses Pandas semantics about what values indicate + nulls. See pyarrow.array for more general conversion from arrays or + sequences to Arrow arrays. + + Parameters + ---------- + obj : ndarray, pandas.Series, array-like + mask : array (boolean), optional + Indicate which values are null (True) or not null (False). 
+ type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred + from the data. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Notes + ----- + Localized timestamps will currently be returned as UTC (pandas's native + representation). Timezone-naive data will be implicitly interpreted as + UTC. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + ChunkedArray is returned if object data overflows binary buffer. + """ + return array(obj, mask=mask, type=type, safe=safe, from_pandas=True, + memory_pool=memory_pool) + + def __reduce__(self): + self._assert_cpu() + return _restore_array, \ + (_reduce_array_data(self.sp_array.get().data().get()),) + + @staticmethod + def from_buffers(DataType type, length, buffers, null_count=-1, offset=0, + children=None): + """ + Construct an Array from a sequence of buffers. + + The concrete type returned depends on the datatype. + + Parameters + ---------- + type : DataType + The value type of the array. + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing this array. + null_count : int, default -1 + The number of null entries in the array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array], default None + Nested type children with length matching type.num_fields. + + Returns + ------- + array : Array + """ + cdef: + Buffer buf + Array child + vector[shared_ptr[CBuffer]] c_buffers + vector[shared_ptr[CArrayData]] c_child_data + shared_ptr[CArrayData] array_data + + children = children or [] + + if type.num_fields != len(children): + raise ValueError("Type's expected number of children " + "({0}) did not match the passed number " + "({1}).".format(type.num_fields, len(children))) + + if type.num_buffers != len(buffers): + raise ValueError("Type's expected number of buffers " + "({0}) did not match the passed number " + "({1}).".format(type.num_buffers, len(buffers))) + + for buf in buffers: + # None will produce a null buffer pointer + c_buffers.push_back(pyarrow_unwrap_buffer(buf)) + + for child in children: + c_child_data.push_back(child.ap.data()) + + array_data = CArrayData.MakeWithChildren(type.sp_type, length, + c_buffers, c_child_data, + null_count, offset) + cdef Array result = pyarrow_wrap_array(MakeArray(array_data)) + result.validate() + return result + + @property + def null_count(self): + self._assert_cpu() + return self.sp_array.get().null_count() + + @property + def nbytes(self): + """ + Total number of bytes consumed by the elements of the array. + + In other words, the sum of bytes from all buffer + ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + """ + self._assert_cpu() + cdef CResult[int64_t] c_size_res + with nogil: + c_size_res = ReferencedBufferSize(deref(self.ap)) + size = GetResultValue(c_size_res) + return size + + def get_total_buffer_size(self): + """ + The sum of bytes in each buffer referenced by the array. 
+ + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + """ + self._assert_cpu() + cdef int64_t total_buffer_size + total_buffer_size = TotalBufferSize(deref(self.ap)) + return total_buffer_size + + def __sizeof__(self): + self._assert_cpu() + return super(Array, self).__sizeof__() + self.nbytes + + def __iter__(self): + self._assert_cpu() + for i in range(len(self)): + yield self.getitem(i) + + def __repr__(self): + type_format = object.__repr__(self) + return '{0}\n{1}'.format(type_format, str(self)) + + def to_string(self, *, int indent=2, int top_level_indent=0, int window=10, + int container_window=2, c_bool skip_new_lines=False): + """ + Render a "pretty-printed" string representation of the Array. + + Note: for data on a non-CPU device, the full array is copied to CPU + memory. + + Parameters + ---------- + indent : int, default 2 + How much to indent the internal items in the string to + the right, by default ``2``. + top_level_indent : int, default 0 + How much to indent right the entire content of the array, + by default ``0``. + window : int + How many primitive items to preview at the begin and end + of the array when the array is bigger than the window. + The other items will be ellipsed. + container_window : int + How many container items (such as a list in a list array) + to preview at the begin and end of the array when the array + is bigger than the window. + skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + """ + cdef: + c_string result + PrettyPrintOptions options + + with nogil: + options = PrettyPrintOptions(top_level_indent, window) + options.skip_new_lines = skip_new_lines + options.indent_size = indent + check_status( + PrettyPrint( + deref(self.ap), + options, + &result + ) + ) + + return frombytes(result, safe=True) + + def format(self, **kwargs): + """ + DEPRECATED, use pyarrow.Array.to_string + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + str + """ + import warnings + warnings.warn('Array.format is deprecated, use Array.to_string') + return self.to_string(**kwargs) + + def __str__(self): + return self.to_string() + + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + # This also handles comparing with None + # as Array.equals(None) raises a TypeError. + return NotImplemented + + def equals(Array self, Array other not None): + """ + Parameters + ---------- + other : pyarrow.Array + + Returns + ------- + bool + """ + self._assert_cpu() + other._assert_cpu() + return self.ap.Equals(deref(other.ap)) + + def __len__(self): + return self.length() + + cdef int64_t length(self): + if self.sp_array.get(): + return self.sp_array.get().length() + else: + return 0 + + def is_null(self, *, nan_is_null=False): + """ + Return BooleanArray indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array + """ + self._assert_cpu() + options = _pc().NullOptions(nan_is_null=nan_is_null) + return _pc().call_function('is_null', [self], options) + + def is_nan(self): + """ + Return BooleanArray indicating the NaN values. 
+ + Returns + ------- + array : boolean Array + """ + self._assert_cpu() + return _pc().call_function('is_nan', [self]) + + def is_valid(self): + """ + Return BooleanArray indicating the non-null values. + """ + self._assert_cpu() + return _pc().is_valid(self) + + def fill_null(self, fill_value): + """ + See :func:`pyarrow.compute.fill_null` for usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array + A new array with nulls replaced by the given value. + """ + self._assert_cpu() + return _pc().fill_null(self, fill_value) + + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or Array (slice) + """ + self._assert_cpu() + if isinstance(key, slice): + return _normalize_slice(self, key) + + return self.getitem(_normalize_index(key, self.length())) + + cdef getitem(self, int64_t i): + self._assert_cpu() + return Scalar.wrap(GetResultValue(self.ap.GetScalar(i))) + + def slice(self, offset=0, length=None): + """ + Compute zero-copy slice of this array. + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice. + length : int, default None + Length of slice (default is until end of Array starting from + offset). + + Returns + ------- + sliced : Array + An array with the same datatype, containing the sliced values. + """ + cdef shared_ptr[CArray] result + + if offset < 0: + raise IndexError('Offset must be non-negative') + + offset = min(len(self), offset) + if length is None: + result = self.ap.Slice(offset) + else: + if length < 0: + raise ValueError('Length must be non-negative') + result = self.ap.Slice(offset, length) + + return pyarrow_wrap_array(result) + + def take(self, object indices): + """ + Select values from an array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. + + Returns + ------- + taken : Array + An array with the same datatype, containing the taken values. + """ + self._assert_cpu() + return _pc().take(self, indices) + + def drop_null(self): + """ + Remove missing values from an array. + """ + self._assert_cpu() + return _pc().drop_null(self) + + def filter(self, object mask, *, null_selection_behavior='drop'): + """ + Select values from an array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array + An array of the same type, with only the elements selected by + the boolean mask. + """ + self._assert_cpu() + return _pc().filter(self, mask, + null_selection_behavior=null_selection_behavior) + + def index(self, value, start=None, end=None, *, memory_pool=None): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. 
+ + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + """ + self._assert_cpu() + return _pc().index(self, value, start, end, memory_pool=memory_pool) + + def sort(self, order="ascending", **kwargs): + """ + Sort the Array + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : Array + """ + self._assert_cpu() + indices = _pc().sort_indices( + self, + options=_pc().SortOptions(sort_keys=[("", order)], **kwargs) + ) + return self.take(indices) + + def _to_pandas(self, options, types_mapper=None, **kwargs): + self._assert_cpu() + return _array_like_to_pandas(self, options, types_mapper=types_mapper) + + def __array__(self, dtype=None, copy=None): + self._assert_cpu() + + if copy is False: + try: + values = self.to_numpy(zero_copy_only=True) + except ArrowInvalid: + raise ValueError( + "Unable to avoid a copy while creating a numpy array as requested.\n" + "If using `np.array(obj, copy=False)` replace it with " + "`np.asarray(obj)` to allow a copy when needed" + ) + # values is already a numpy array at this point, but calling np.array(..) + # again to handle the `dtype` keyword with a no-copy guarantee + return np.array(values, dtype=dtype, copy=False) + + values = self.to_numpy(zero_copy_only=False) + if copy is True and is_numeric(self.type.id) and self.null_count == 0: + # to_numpy did not yet make a copy (is_numeric = integer/floats, no decimal) + return np.array(values, dtype=dtype, copy=True) + + if dtype is None: + return values + return np.asarray(values, dtype=dtype) + + def to_numpy(self, zero_copy_only=True, writable=False): + """ + Return a NumPy view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for primitive arrays with the same memory layout as NumPy + (i.e. integers, floating point, ..) and without any nulls. + + For the extension arrays, this method simply delegates to the + underlying storage array. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + self._assert_cpu() + + if np is None: + raise ImportError( + "Cannot return a numpy.ndarray if NumPy is not present") + cdef: + PyObject* out + PandasOptions c_options + object values + + if zero_copy_only and writable: + raise ValueError( + "Cannot return a writable array if asking for zero-copy") + + # If there are nulls and the array is a DictionaryArray + # decoding the dictionary will make sure nulls are correctly handled. + # Decoding a dictionary does imply a copy by the way, + # so it can't be done if the user requested a zero_copy. 
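+        # to_numpy=True asks the converter for a plain ndarray result rather
+        # than a pandas-wrapped one (see the wrap_array_output note below).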
+ c_options.decode_dictionaries = True + c_options.zero_copy_only = zero_copy_only + c_options.to_numpy = True + + with nogil: + check_status(ConvertArrayToPandas(c_options, self.sp_array, + self, &out)) + + # wrap_array_output uses pandas to convert to Categorical, here + # always convert to numpy array without pandas dependency + array = PyObject_to_object(out) + + if writable and not array.flags.writeable: + # if the conversion already needed to a copy, writeable is True + array = array.copy() + return array + + def to_pylist(self): + """ + Convert to a list of native Python objects. + + Returns + ------- + lst : list + """ + self._assert_cpu() + return [x.as_py() for x in self] + + def tolist(self): + """ + Alias of to_pylist for compatibility with NumPy. + """ + return self.to_pylist() + + def validate(self, *, full=False): + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + if full: + self._assert_cpu() + with nogil: + check_status(self.ap.ValidateFull()) + else: + with nogil: + check_status(self.ap.Validate()) + + @property + def offset(self): + """ + A relative position into another array's data. + + The purpose is to enable zero-copy slicing. This value defaults to zero + but must be applied on all operations with the physical storage + buffers. + """ + return self.sp_array.get().offset() + + def buffers(self): + """ + Return a list of Buffer objects pointing to this array's physical + storage. + + To correctly interpret these buffers, you need to also apply the offset + multiplied with the size of the stored data type. + """ + res = [] + _append_array_buffers(self.sp_array.get().data().get(), res) + return res + + def copy_to(self, destination): + """ + Construct a copy of the array with all buffers on destination + device. + + This method recursively copies the array's buffers and those of its + children onto the destination MemoryManager device and returns the + new Array. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + Array + """ + cdef: + shared_ptr[CArray] c_array + shared_ptr[CMemoryManager] c_memory_manager + + if isinstance(destination, Device): + c_memory_manager = (destination).unwrap().get().default_memory_manager() + elif isinstance(destination, MemoryManager): + c_memory_manager = (destination).unwrap() + else: + raise TypeError( + "Argument 'destination' has incorrect type (expected a " + f"pyarrow Device or MemoryManager, got {type(destination)})" + ) + + with nogil: + c_array = GetResultValue(self.ap.CopyTo(c_memory_manager)) + return pyarrow_wrap_array(c_array) + + def _export_to_c(self, out_ptr, out_schema_ptr=0): + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. 
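+
+        A rough round-trip sketch through the C Data Interface, assuming the
+        optional ``cffi`` package is installed so that ``pyarrow.cffi`` can be
+        imported (illustrative only)::
+
+            import pyarrow as pa
+            from pyarrow.cffi import ffi
+
+            arr = pa.array([1, 2, 3])
+            c_schema = ffi.new("struct ArrowSchema*")
+            c_array = ffi.new("struct ArrowArray*")
+            arr._export_to_c(int(ffi.cast("uintptr_t", c_array)),
+                             int(ffi.cast("uintptr_t", c_schema)))
+            roundtripped = pa.Array._import_from_c(
+                int(ffi.cast("uintptr_t", c_array)),
+                int(ffi.cast("uintptr_t", c_schema)))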
+ """ + cdef: + void* c_ptr = _as_c_pointer(out_ptr) + void* c_schema_ptr = _as_c_pointer(out_schema_ptr, + allow_null=True) + with nogil: + check_status(ExportArray(deref(self.sp_array), + c_ptr, + c_schema_ptr)) + + @staticmethod + def _import_from_c(in_ptr, type): + """ + Import Array from a C ArrowArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + cdef: + void* c_ptr = _as_c_pointer(in_ptr) + void* c_type_ptr + shared_ptr[CArray] c_array + + c_type = pyarrow_unwrap_data_type(type) + if c_type == nullptr: + # Not a DataType object, perhaps a raw ArrowSchema pointer + c_type_ptr = _as_c_pointer(type) + with nogil: + c_array = GetResultValue(ImportArray( + c_ptr, c_type_ptr)) + else: + with nogil: + c_array = GetResultValue(ImportArray( c_ptr, + c_type)) + return pyarrow_wrap_array(c_array) + + def __arrow_c_array__(self, requested_schema=None): + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + self._assert_cpu() + + cdef: + ArrowArray* c_array + ArrowSchema* c_schema + shared_ptr[CArray] inner_array + + if requested_schema is not None: + target_type = DataType._import_from_c_capsule(requested_schema) + + if target_type != self.type: + try: + casted_array = _pc().cast(self, target_type, safe=True) + inner_array = pyarrow_unwrap_array(casted_array) + except ArrowInvalid as e: + raise ValueError( + f"Could not cast {self.type} to requested type {target_type}: {e}" + ) + else: + inner_array = self.sp_array + else: + inner_array = self.sp_array + + schema_capsule = alloc_c_schema(&c_schema) + array_capsule = alloc_c_array(&c_array) + + with nogil: + check_status(ExportArray(deref(inner_array), c_array, c_schema)) + + return schema_capsule, array_capsule + + @staticmethod + def _import_from_c_capsule(schema_capsule, array_capsule): + cdef: + ArrowSchema* c_schema + ArrowArray* c_array + shared_ptr[CArray] array + + c_schema = PyCapsule_GetPointer(schema_capsule, 'arrow_schema') + c_array = PyCapsule_GetPointer(array_capsule, 'arrow_array') + + with nogil: + array = GetResultValue(ImportArray(c_array, c_schema)) + + return pyarrow_wrap_array(array) + + def _export_to_c_device(self, out_ptr, out_schema_ptr=0): + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. 
+ """ + cdef: + void* c_ptr = _as_c_pointer(out_ptr) + void* c_schema_ptr = _as_c_pointer(out_schema_ptr, + allow_null=True) + with nogil: + check_status(ExportDeviceArray( + deref(self.sp_array), NULL, + c_ptr, c_schema_ptr)) + + @staticmethod + def _import_from_c_device(in_ptr, type): + """ + Import Array from a C ArrowDeviceArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + cdef: + ArrowDeviceArray* c_device_array = _as_c_pointer(in_ptr) + void* c_type_ptr + shared_ptr[CArray] c_array + + if c_device_array.device_type == ARROW_DEVICE_CUDA: + _ensure_cuda_loaded() + + c_type = pyarrow_unwrap_data_type(type) + if c_type == nullptr: + # Not a DataType object, perhaps a raw ArrowSchema pointer + c_type_ptr = _as_c_pointer(type) + with nogil: + c_array = GetResultValue( + ImportDeviceArray(c_device_array, c_type_ptr) + ) + else: + with nogil: + c_array = GetResultValue( + ImportDeviceArray(c_device_array, c_type) + ) + return pyarrow_wrap_array(c_array) + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. 
+ """ + cdef: + ArrowDeviceArray* c_array + ArrowSchema* c_schema + shared_ptr[CArray] inner_array + + non_default_kwargs = [ + name for name, value in kwargs.items() if value is not None + ] + if non_default_kwargs: + raise NotImplementedError( + f"Received unsupported keyword argument(s): {non_default_kwargs}" + ) + + if requested_schema is not None: + target_type = DataType._import_from_c_capsule(requested_schema) + + if target_type != self.type: + if not self.is_cpu: + raise NotImplementedError( + "Casting to a requested schema is only supported for CPU data" + ) + try: + casted_array = _pc().cast(self, target_type, safe=True) + inner_array = pyarrow_unwrap_array(casted_array) + except ArrowInvalid as e: + raise ValueError( + f"Could not cast {self.type} to requested type {target_type}: {e}" + ) + else: + inner_array = self.sp_array + else: + inner_array = self.sp_array + + schema_capsule = alloc_c_schema(&c_schema) + array_capsule = alloc_c_device_array(&c_array) + + with nogil: + check_status(ExportDeviceArray( + deref(inner_array), NULL, + c_array, c_schema)) + + return schema_capsule, array_capsule + + @staticmethod + def _import_from_c_device_capsule(schema_capsule, array_capsule): + cdef: + ArrowSchema* c_schema + ArrowDeviceArray* c_array + shared_ptr[CArray] array + + c_schema = PyCapsule_GetPointer(schema_capsule, 'arrow_schema') + c_array = PyCapsule_GetPointer( + array_capsule, 'arrow_device_array' + ) + + with nogil: + array = GetResultValue(ImportDeviceArray(c_array, c_schema)) + + return pyarrow_wrap_array(array) + + def __dlpack__(self, stream=None): + """Export a primitive array as a DLPack capsule. + + Parameters + ---------- + stream : int, optional + A Python integer representing a pointer to a stream. Currently not supported. + Stream is provided by the consumer to the producer to instruct the producer + to ensure that operations can safely be performed on the array. + + Returns + ------- + capsule : PyCapsule + A DLPack capsule for the array, pointing to a DLManagedTensor. + """ + if stream is None: + dlm_tensor = GetResultValue(ExportToDLPack(self.sp_array)) + + return PyCapsule_New(dlm_tensor, 'dltensor', dlpack_pycapsule_deleter) + else: + raise NotImplementedError( + "Only stream=None is supported." + ) + + def __dlpack_device__(self): + """ + Return the DLPack device tuple this arrays resides on. + + Returns + ------- + tuple : Tuple[int, int] + Tuple with index specifying the type of the device (where + CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the + device which is 0 by default for CPU. + """ + device = GetResultValue(ExportDevice(self.sp_array)) + return device.device_type, device.device_id + + @property + def device_type(self): + """ + The device type where the array resides. + + Returns + ------- + DeviceAllocationType + """ + return _wrap_device_allocation_type(self.sp_array.get().device_type()) + + @property + def is_cpu(self): + """ + Whether the array is CPU-accessible. 
+ """ + return self.device_type == DeviceAllocationType.CPU + + cdef void _assert_cpu(self) except *: + if self.sp_array.get().device_type() != CDeviceAllocationType_kCPU: + raise NotImplementedError("Implemented only for data on CPU device") + + +cdef _array_like_to_pandas(obj, options, types_mapper): + cdef: + PyObject* out + PandasOptions c_options = _convert_pandas_options(options) + + original_type = obj.type + name = obj._name + dtype = None + + if types_mapper: + dtype = types_mapper(original_type) + elif original_type.id == _Type_EXTENSION: + try: + dtype = original_type.to_pandas_dtype() + except NotImplementedError: + pass + + # Only call __from_arrow__ for Arrow extension types or when explicitly + # overridden via types_mapper + if hasattr(dtype, '__from_arrow__'): + arr = dtype.__from_arrow__(obj) + return pandas_api.series(arr, name=name, copy=False) + + if pandas_api.is_v1(): + # ARROW-3789: Coerce date/timestamp types to datetime64[ns] + c_options.coerce_temporal_nanoseconds = True + + if isinstance(obj, Array): + with nogil: + check_status(ConvertArrayToPandas(c_options, + ( obj).sp_array, + obj, &out)) + elif isinstance(obj, ChunkedArray): + with nogil: + check_status(libarrow_python.ConvertChunkedArrayToPandas( + c_options, + ( obj).sp_chunked_array, + obj, &out)) + + arr = wrap_array_output(out) + + if (isinstance(original_type, TimestampType) and + options["timestamp_as_object"]): + # ARROW-5359 - need to specify object dtype to avoid pandas to + # coerce back to ns resolution + dtype = "object" + elif types_mapper: + dtype = types_mapper(original_type) + else: + dtype = None + + result = pandas_api.series(arr, dtype=dtype, name=name, copy=False) + + if (isinstance(original_type, TimestampType) and + original_type.tz is not None and + # can be object dtype for non-ns and timestamp_as_object=True + result.dtype.kind == "M"): + from pyarrow.pandas_compat import make_tz_aware + result = make_tz_aware(result, original_type.tz) + + return result + + +cdef wrap_array_output(PyObject* output): + cdef object obj = PyObject_to_object(output) + + if isinstance(obj, dict): + return _pandas_api.categorical_type.from_codes( + obj['indices'], categories=obj['dictionary'], ordered=obj['ordered'] + ) + else: + return obj + + +cdef class NullArray(Array): + """ + Concrete class for Arrow arrays of null data type. + """ + + +cdef class BooleanArray(Array): + """ + Concrete class for Arrow arrays of boolean data type. + """ + @property + def false_count(self): + return ( self.ap).false_count() + + @property + def true_count(self): + return ( self.ap).true_count() + + +cdef class NumericArray(Array): + """ + A base class for Arrow numeric arrays. + """ + + +cdef class IntegerArray(NumericArray): + """ + A base class for Arrow integer arrays. + """ + + +cdef class FloatingPointArray(NumericArray): + """ + A base class for Arrow floating-point arrays. + """ + + +cdef class Int8Array(IntegerArray): + """ + Concrete class for Arrow arrays of int8 data type. + """ + + +cdef class UInt8Array(IntegerArray): + """ + Concrete class for Arrow arrays of uint8 data type. + """ + + +cdef class Int16Array(IntegerArray): + """ + Concrete class for Arrow arrays of int16 data type. + """ + + +cdef class UInt16Array(IntegerArray): + """ + Concrete class for Arrow arrays of uint16 data type. + """ + + +cdef class Int32Array(IntegerArray): + """ + Concrete class for Arrow arrays of int32 data type. + """ + + +cdef class UInt32Array(IntegerArray): + """ + Concrete class for Arrow arrays of uint32 data type. 
+ """ + + +cdef class Int64Array(IntegerArray): + """ + Concrete class for Arrow arrays of int64 data type. + """ + + +cdef class UInt64Array(IntegerArray): + """ + Concrete class for Arrow arrays of uint64 data type. + """ + + +cdef class Date32Array(NumericArray): + """ + Concrete class for Arrow arrays of date32 data type. + """ + + +cdef class Date64Array(NumericArray): + """ + Concrete class for Arrow arrays of date64 data type. + """ + + +cdef class TimestampArray(NumericArray): + """ + Concrete class for Arrow arrays of timestamp data type. + """ + + +cdef class Time32Array(NumericArray): + """ + Concrete class for Arrow arrays of time32 data type. + """ + + +cdef class Time64Array(NumericArray): + """ + Concrete class for Arrow arrays of time64 data type. + """ + + +cdef class DurationArray(NumericArray): + """ + Concrete class for Arrow arrays of duration data type. + """ + + +cdef class MonthDayNanoIntervalArray(Array): + """ + Concrete class for Arrow arrays of interval[MonthDayNano] type. + """ + + def to_pylist(self): + """ + Convert to a list of native Python objects. + + pyarrow.MonthDayNano is used as the native representation. + + Returns + ------- + lst : list + """ + cdef: + CResult[PyObject*] maybe_py_list + PyObject* py_list + CMonthDayNanoIntervalArray* array + array = self.sp_array.get() + maybe_py_list = MonthDayNanoIntervalArrayToPyList(deref(array)) + py_list = GetResultValue(maybe_py_list) + return PyObject_to_object(py_list) + + +cdef class HalfFloatArray(FloatingPointArray): + """ + Concrete class for Arrow arrays of float16 data type. + """ + + +cdef class FloatArray(FloatingPointArray): + """ + Concrete class for Arrow arrays of float32 data type. + """ + + +cdef class DoubleArray(FloatingPointArray): + """ + Concrete class for Arrow arrays of float64 data type. + """ + + +cdef class FixedSizeBinaryArray(Array): + """ + Concrete class for Arrow arrays of a fixed-size binary data type. + """ + + +cdef class Decimal128Array(FixedSizeBinaryArray): + """ + Concrete class for Arrow arrays of decimal128 data type. + """ + + +cdef class Decimal256Array(FixedSizeBinaryArray): + """ + Concrete class for Arrow arrays of decimal256 data type. + """ + +cdef class BaseListArray(Array): + + def flatten(self, recursive=False): + """ + Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray + according to 'recursive'. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Parameters + ---------- + recursive : bool, default False, optional + When True, flatten this logical list-array recursively until an + array of non-list values is formed. + + When False, flatten only the top level. + + Returns + ------- + result : Array + + Examples + -------- + + Basic logical list-array's flatten + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + + When recursive=True, nested list arrays are flattened recursively + until an array of non-list values is formed. + + >>> array = pa.array([ + ... None, + ... [ + ... [1, None, 2], + ... None, + ... [3, 4] + ... ], + ... [], + ... [ + ... [], + ... [5, 6], + ... None + ... ], + ... [ + ... [7, 8] + ... ] + ... 
], type=pa.list_(pa.list_(pa.int64()))) + >>> array.flatten(True) + + [ + 1, + null, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ] + """ + options = _pc().ListFlattenOptions(recursive) + return _pc().list_flatten(self, options=options) + + def value_parent_indices(self): + """ + Return array of same length as list child values array where each + output value is the index of the parent list array slot containing each + child value. + + Examples + -------- + >>> import pyarrow as pa + >>> arr = pa.array([[1, 2, 3], [], None, [4]], + ... type=pa.list_(pa.int32())) + >>> arr.value_parent_indices() + + [ + 0, + 0, + 0, + 3 + ] + """ + return _pc().list_parent_indices(self) + + def value_lengths(self): + """ + Return integers array with values equal to the respective length of + each list element. Null list values are null in the output. + + Examples + -------- + >>> import pyarrow as pa + >>> arr = pa.array([[1, 2, 3], [], None, [4]], + ... type=pa.list_(pa.int32())) + >>> arr.value_lengths() + + [ + 3, + 0, + null, + 1 + ] + """ + return _pc().list_value_length(self) + + +cdef class ListArray(BaseListArray): + """ + Concrete class for Arrow arrays of a list data type. + """ + + @staticmethod + def from_arrays(offsets, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct ListArray from arrays of int32 offsets and values. + + Parameters + ---------- + offsets : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : ListArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + >>> # nulls in the offsets array become null lists + >>> offsets = pa.array([0, None, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CListArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CListArray.FromArrays( + _offsets.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the ListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + ListArray.flatten : ... 
+ + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, None, 6]]) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + """ + cdef CListArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, 5]]) + >>> array.offsets + + [ + 0, + 2, + 2, + 5 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + +cdef class LargeListArray(BaseListArray): + """ + Concrete class for Arrow arrays of a large list data type. + + Identical to ListArray, but 64-bit offsets. + """ + + @staticmethod + def from_arrays(offsets, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct LargeListArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : LargeListArray + """ + cdef: + Array _offsets, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int64') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CLargeListArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CLargeListArray.FromArrays( + _offsets.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from the sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array( + ... [[1, 2], None, [3, 4, None, 6]], + ... type=pa.large_list(pa.int32()), + ... ) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. 
Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + """ + cdef CLargeListArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + +cdef class ListViewArray(BaseListArray): + """ + Concrete class for Arrow arrays of a list view data type. + """ + + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _sizes = asarray(sizes, type='int32') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. 
+ + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + +cdef class LargeListViewArray(BaseListArray): + """ + Concrete class for Arrow arrays of a large list view data type. + + Identical to ListViewArray, but with 64-bit offsets. + """ + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int64') + _sizes = asarray(sizes, type='int64') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CLargeListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list view sizes as an int64 array. 
+ + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + +cdef class MapArray(ListArray): + """ + Concrete class for Arrow arrays of a map data type. + """ + + @staticmethod + def from_arrays(offsets, keys, items, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct MapArray from arrays of int32 offsets and key, item arrays. + + Parameters + ---------- + offsets : array-like or sequence (int32 type) + keys : array-like or sequence (any type) + items : array-like or sequence (any type) + type : DataType, optional + If not specified, a default MapArray with the keys' and items' type is used. + pool : MemoryPool + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + map_array : MapArray + + Examples + -------- + First, let's understand the structure of our dataset when viewed in a rectangular data model. + The total of 5 respondents answered the question "How much did you like the movie x?". + The value -1 in the integer array means that the value is missing. The boolean array + represents the null bitmask corresponding to the missing values in the integer array. + + >>> import pyarrow as pa + >>> movies_rectangular = np.ma.masked_array([ + ... [10, -1, -1], + ... [8, 4, 5], + ... [-1, 10, 3], + ... [-1, -1, -1], + ... [-1, -1, -1] + ... ], + ... [ + ... [False, True, True], + ... [False, False, False], + ... [True, False, False], + ... [True, True, True], + ... [True, True, True], + ... ]) + + To represent the same data with the MapArray and from_arrays, the data is + formed like this: + + >>> offsets = [ + ... 0, # -- row 1 start + ... 1, # -- row 2 start + ... 4, # -- row 3 start + ... 6, # -- row 4 start + ... 6, # -- row 5 start + ... 6, # -- row 5 end + ... ] + >>> movies = [ + ... "Dark Knight", # ---------------------------------- row 1 + ... "Dark Knight", "Meet the Parents", "Superman", # -- row 2 + ... "Meet the Parents", "Superman", # ----------------- row 3 + ... ] + >>> likings = [ + ... 10, # -------- row 1 + ... 8, 4, 5, # --- row 2 + ... 10, 3 # ------ row 3 + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 [] + 4 [] + dtype: object + + If the data in the empty rows needs to be marked as missing, it's possible + to do so by modifying the offsets argument, so that we specify `None` as + the starting positions of the rows we want marked as missing. The end row + offset still has to refer to the existing value from keys (and values): + + >>> offsets = [ + ... 0, # ----- row 1 start + ... 1, # ----- row 2 start + ... 4, # ----- row 3 start + ... None, # -- row 4 start + ... None, # -- row 5 start + ... 6, # ----- row 5 end + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... 
+ 2 [(Meet the Parents, 10), (Superman, 3)] + 3 None + 4 None + dtype: object + """ + cdef: + Array _offsets, _keys, _items + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _keys = asarray(keys) + _items = asarray(items) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CMapArray.FromArraysAndType( + type.sp_type, _offsets.sp_array, + _keys.sp_array, _items.sp_array, cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CMapArray.FromArrays(_offsets.sp_array, + _keys.sp_array, + _items.sp_array, cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def keys(self): + """Flattened array of keys across all maps in array""" + return pyarrow_wrap_array(( self.ap).keys()) + + @property + def items(self): + """Flattened array of items across all maps in array""" + return pyarrow_wrap_array(( self.ap).items()) + + +cdef class FixedSizeListArray(BaseListArray): + """ + Concrete class for Arrow arrays of a fixed size list data type. + """ + + @staticmethod + def from_arrays(values, list_size=None, DataType type=None, mask=None): + """ + Construct FixedSizeListArray from array of values and a list length. + + Parameters + ---------- + values : Array (any type) + list_size : int + The fixed length of the lists. + type : DataType, optional + If not specified, a default ListType with the values' type and + `list_size` length is used. + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + + Returns + ------- + FixedSizeListArray + + Examples + -------- + + Create from a values array and a list size: + + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> arr = pa.FixedSizeListArray.from_arrays(values, 2) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + + Or create from a values array, list size and matching type: + + >>> typ = pa.list_(pa.field("values", pa.int64()), 2) + >>> arr = pa.FixedSizeListArray.from_arrays(values,type=typ) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ + cdef: + Array _values + int32_t _list_size + CResult[shared_ptr[CArray]] c_result + + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, None) + + if type is not None: + if list_size is not None: + raise ValueError("Cannot specify both list_size and type") + with nogil: + c_result = CFixedSizeListArray.FromArraysAndType( + _values.sp_array, type.sp_type, c_mask) + else: + if list_size is None: + raise ValueError("Should specify one of list_size and type") + _list_size = list_size + with nogil: + c_result = CFixedSizeListArray.FromArrays( + _values.sp_array, _list_size, c_mask) + cdef Array result = pyarrow_wrap_array(GetResultValue(c_result)) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the + FixedSizeListArray. + + Note even null elements are included. + + Compare with :meth:`flatten`, which returns only the non-null + sub-list values. + + Returns + ------- + values : Array + + See Also + -------- + FixedSizeListArray.flatten : ... + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array( + ... [[1, 2], None, [3, None]], + ... type=pa.list_(pa.int32(), 2) + ... 
) + >>> array.values + + [ + 1, + 2, + null, + null, + 3, + null + ] + + """ + cdef CFixedSizeListArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + +cdef class UnionArray(Array): + """ + Concrete class for Arrow arrays of a Union data type. + """ + + def child(self, int pos): + """ + DEPRECATED, use field() instead. + + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : pyarrow.Field + The given child field. + """ + import warnings + warnings.warn("child is deprecated, use field", FutureWarning) + return self.field(pos) + + def field(self, int pos): + """ + Return the given child field as an individual array. + + For sparse unions, the returned array has its offset, length, + and null count adjusted. + + For dense unions, the returned array is unchanged. + + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : Array + The given child field. + """ + cdef shared_ptr[CArray] result + result = ( self.ap).field(pos) + if result != NULL: + return pyarrow_wrap_array(result) + raise KeyError("UnionArray does not have child {}".format(pos)) + + @property + def type_codes(self): + """Get the type codes array.""" + buf = pyarrow_wrap_buffer(( self.ap).type_codes()) + return Array.from_buffers(int8(), len(self), [None, buf]) + + @property + def offsets(self): + """ + Get the value offsets array (dense arrays only). + + Does not account for any slice offset. + """ + if self.type.mode != "dense": + raise ArrowTypeError("Can only get value offsets for dense arrays") + cdef CDenseUnionArray* dense = self.ap + buf = pyarrow_wrap_buffer(dense.value_offsets()) + return Array.from_buffers(int32(), len(self), [None, buf]) + + @staticmethod + def from_dense(Array types, Array value_offsets, list children, + list field_names=None, list type_codes=None): + """ + Construct dense UnionArray from arrays of int8 types, int32 offsets and + children arrays + + Parameters + ---------- + types : Array (int8 type) + value_offsets : Array (int32 type) + children : list + field_names : list + type_codes : list + + Returns + ------- + union_array : UnionArray + """ + cdef: + shared_ptr[CArray] out + vector[shared_ptr[CArray]] c + Array child + vector[c_string] c_field_names + vector[int8_t] c_type_codes + + for child in children: + c.push_back(child.sp_array) + if field_names is not None: + for x in field_names: + c_field_names.push_back(tobytes(x)) + if type_codes is not None: + for x in type_codes: + c_type_codes.push_back(x) + + with nogil: + out = GetResultValue(CDenseUnionArray.Make( + deref(types.ap), deref(value_offsets.ap), c, c_field_names, + c_type_codes)) + + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @staticmethod + def from_sparse(Array types, list children, list field_names=None, + list type_codes=None): + """ + Construct sparse UnionArray from arrays of int8 types and children + arrays + + Parameters + ---------- + types : Array (int8 type) + children : list + field_names : list + type_codes : list + + Returns + ------- + union_array : UnionArray + """ + cdef: + shared_ptr[CArray] out + vector[shared_ptr[CArray]] c + Array child + vector[c_string] c_field_names + vector[int8_t] c_type_codes + + for child in children: + c.push_back(child.sp_array) + if field_names is not None: + for x in field_names: + c_field_names.push_back(tobytes(x)) + if type_codes is not None: + for x in type_codes: + 
c_type_codes.push_back(x) + + with nogil: + out = GetResultValue(CSparseUnionArray.Make( + deref(types.ap), c, c_field_names, c_type_codes)) + + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + +cdef class StringArray(Array): + """ + Concrete class for Arrow arrays of string (or utf8) data type. + """ + + @staticmethod + def from_buffers(int length, Buffer value_offsets, Buffer data, + Buffer null_bitmap=None, int null_count=-1, + int offset=0): + """ + Construct a StringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. + + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ + return Array.from_buffers(utf8(), length, + [null_bitmap, value_offsets, data], + null_count, offset) + + +cdef class LargeStringArray(Array): + """ + Concrete class for Arrow arrays of large string (or utf8) data type. + """ + + @staticmethod + def from_buffers(int length, Buffer value_offsets, Buffer data, + Buffer null_bitmap=None, int null_count=-1, + int offset=0): + """ + Construct a LargeStringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. + + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ + return Array.from_buffers(large_utf8(), length, + [null_bitmap, value_offsets, data], + null_count, offset) + + +cdef class StringViewArray(Array): + """ + Concrete class for Arrow arrays of string (or utf8) view data type. + """ + + +cdef class BinaryArray(Array): + """ + Concrete class for Arrow arrays of variable-sized binary data type. + """ + @property + def total_values_length(self): + """ + The number of bytes from beginning to end of the data buffer addressed + by the offsets of this BinaryArray. + """ + return ( self.ap).total_values_length() + + +cdef class LargeBinaryArray(Array): + """ + Concrete class for Arrow arrays of large variable-sized binary data type. + """ + @property + def total_values_length(self): + """ + The number of bytes from beginning to end of the data buffer addressed + by the offsets of this LargeBinaryArray. + """ + return ( self.ap).total_values_length() + + +cdef class BinaryViewArray(Array): + """ + Concrete class for Arrow arrays of variable-sized binary view data type. + """ + + +cdef class DictionaryArray(Array): + """ + Concrete class for dictionary-encoded Arrow arrays. + """ + + def dictionary_encode(self): + return self + + def dictionary_decode(self): + """ + Decodes the DictionaryArray to an Array. + """ + return self.dictionary.take(self.indices) + + @property + def dictionary(self): + cdef CDictionaryArray* darr = (self.ap) + + if self._dictionary is None: + self._dictionary = pyarrow_wrap_array(darr.dictionary()) + + return self._dictionary + + @property + def indices(self): + cdef CDictionaryArray* darr = (self.ap) + + if self._indices is None: + self._indices = pyarrow_wrap_array(darr.indices()) + + return self._indices + + @staticmethod + def from_buffers(DataType type, int64_t length, buffers, Array dictionary, + int64_t null_count=-1, int64_t offset=0): + """ + Construct a DictionaryArray from buffers. 
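+
+        A rough sketch of typical inputs (illustrative only; the buffers are
+        those backing the *indices*, here an int8 array without nulls)::
+
+            import pyarrow as pa
+
+            indices = pa.array([0, 1, 0], type=pa.int8())
+            dictionary = pa.array(["a", "b"])
+            pa.DictionaryArray.from_buffers(
+                pa.dictionary(pa.int8(), pa.string()), 3,
+                indices.buffers(), dictionary)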
+ + Parameters + ---------- + type : pyarrow.DataType + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing the indices array. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + null_count : int, default -1 + The number of null entries in the indices array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + + Returns + ------- + dict_array : DictionaryArray + """ + cdef: + vector[shared_ptr[CBuffer]] c_buffers + shared_ptr[CDataType] c_type + shared_ptr[CArrayData] c_data + shared_ptr[CArray] c_result + + for buf in buffers: + c_buffers.push_back(pyarrow_unwrap_buffer(buf)) + + c_type = pyarrow_unwrap_data_type(type) + + with nogil: + c_data = CArrayData.Make( + c_type, length, c_buffers, null_count, offset) + c_data.get().dictionary = dictionary.sp_array.get().data() + c_result.reset(new CDictionaryArray(c_data)) + + cdef Array result = pyarrow_wrap_array(c_result) + result.validate() + return result + + @staticmethod + def from_arrays(indices, dictionary, mask=None, bint ordered=False, + bint from_pandas=False, bint safe=True, + MemoryPool memory_pool=None): + """ + Construct a DictionaryArray from indices and values. + + Parameters + ---------- + indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type + Non-negative integers referencing the dictionary values by zero + based index. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + mask : ndarray or pandas.Series, bool type + True values indicate that indices are actually null. + ordered : bool, default False + Set to True if the category values are ordered. + from_pandas : bool, default False + If True, the indices should be treated as though they originated in + a pandas.Categorical (null encoded as -1). + safe : bool, default True + If True, check that the dictionary indices are in range. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise uses default pool. + + Returns + ------- + dict_array : DictionaryArray + """ + cdef: + Array _indices, _dictionary + shared_ptr[CDataType] c_type + shared_ptr[CArray] c_result + + if isinstance(indices, Array): + if mask is not None: + raise NotImplementedError( + "mask not implemented with Arrow array inputs yet") + _indices = indices + else: + if from_pandas: + _indices = _codes_to_indices(indices, mask, None, memory_pool) + else: + _indices = array(indices, mask=mask, memory_pool=memory_pool) + + if isinstance(dictionary, Array): + _dictionary = dictionary + else: + _dictionary = array(dictionary, memory_pool=memory_pool) + + if not isinstance(_indices, IntegerArray): + raise ValueError('Indices must be integer type') + + cdef c_bool c_ordered = ordered + + c_type.reset(new CDictionaryType(_indices.type.sp_type, + _dictionary.sp_array.get().type(), + c_ordered)) + + if safe: + with nogil: + c_result = GetResultValue( + CDictionaryArray.FromArrays(c_type, _indices.sp_array, + _dictionary.sp_array)) + else: + c_result.reset(new CDictionaryArray(c_type, _indices.sp_array, + _dictionary.sp_array)) + + cdef Array result = pyarrow_wrap_array(c_result) + result.validate() + return result + + +cdef class StructArray(Array): + """ + Concrete class for Arrow arrays of a struct data type. + """ + + def field(self, index): + """ + Retrieves the child array belonging to field. 
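+
+        A minimal sketch of typical usage (illustrative only)::
+
+            import pyarrow as pa
+
+            arr = pa.array([{"x": 1, "y": True}, {"x": 2, "y": None}])
+            arr.field("x")   # -> Int64Array [1, 2]
+            arr.field(1)     # -> BooleanArray [true, null]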
+ + Parameters + ---------- + index : Union[int, str] + Index / position or name of the field. + + Returns + ------- + result : Array + """ + cdef: + CStructArray* arr = self.ap + shared_ptr[CArray] child + + if isinstance(index, (bytes, str)): + child = arr.GetFieldByName(tobytes(index)) + if child == nullptr: + raise KeyError(index) + elif isinstance(index, int): + child = arr.field( + _normalize_index(index, self.ap.num_fields())) + else: + raise TypeError('Expected integer or string index') + + return pyarrow_wrap_array(child) + + def _flattened_field(self, index, MemoryPool memory_pool=None): + """ + Retrieves the child array belonging to field, + accounting for the parent array null bitmap. + + Parameters + ---------- + index : Union[int, str] + Index / position or name of the field. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + result : Array + """ + cdef: + CStructArray* arr = self.ap + shared_ptr[CArray] child + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + + if isinstance(index, (bytes, str)): + int_index = self.type.get_field_index(index) + if int_index < 0: + raise KeyError(index) + elif isinstance(index, int): + int_index = _normalize_index(index, self.ap.num_fields()) + else: + raise TypeError('Expected integer or string index') + + child = GetResultValue(arr.GetFlattenedField(int_index, pool)) + return pyarrow_wrap_array(child) + + def flatten(self, MemoryPool memory_pool=None): + """ + Return one individual array for each field in the struct. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + result : List[Array] + """ + cdef: + vector[shared_ptr[CArray]] arrays + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + CStructArray* sarr = self.ap + + with nogil: + arrays = GetResultValue(sarr.Flatten(pool)) + + return [pyarrow_wrap_array(arr) for arr in arrays] + + @staticmethod + def from_arrays(arrays, names=None, fields=None, mask=None, + memory_pool=None, type=None): + """ + Construct StructArray from collection of arrays representing + each field in the struct. + + Either field names, field instances or a struct type must be passed. + + Parameters + ---------- + arrays : sequence of Array + names : List[str] (optional) + Field names for each struct child. + fields : List[Field] (optional) + Field instances for each struct child. + mask : pyarrow.Array[bool] (optional) + Indicate which values are null (True) or not null (False). + memory_pool : MemoryPool (optional) + For memory allocations, if required, otherwise uses default pool. + type : pyarrow.StructType (optional) + Struct type for name and type of each child. 
+ + Returns + ------- + result : StructArray + """ + cdef: + shared_ptr[CArray] c_array + shared_ptr[CBuffer] c_mask + vector[shared_ptr[CArray]] c_arrays + vector[c_string] c_names + vector[shared_ptr[CField]] c_fields + CResult[shared_ptr[CArray]] c_result + ssize_t num_arrays + ssize_t length + ssize_t i + Field py_field + DataType struct_type + + if fields is not None and type is not None: + raise ValueError('Must pass either fields or type, not both') + + if type is not None: + fields = [] + for field in type: + fields.append(field) + + if names is None and fields is None: + raise ValueError('Must pass either names or fields') + if names is not None and fields is not None: + raise ValueError('Must pass either names or fields, not both') + + c_mask = c_mask_inverted_from_obj(mask, memory_pool) + + arrays = [asarray(x) for x in arrays] + for arr in arrays: + c_array = pyarrow_unwrap_array(arr) + if c_array == nullptr: + raise TypeError(f"Expected Array, got {arr.__class__}") + c_arrays.push_back(c_array) + if names is not None: + for name in names: + c_names.push_back(tobytes(name)) + else: + for item in fields: + if isinstance(item, tuple): + py_field = field(*item) + else: + py_field = item + c_fields.push_back(py_field.sp_field) + + if (c_arrays.size() == 0 and c_names.size() == 0 and + c_fields.size() == 0): + # The C++ side doesn't allow this + if mask is None: + return array([], struct([])) + else: + return array([{}] * len(mask), struct([]), mask=mask) + + if names is not None: + # XXX Cannot pass "nullptr" for a shared_ptr argument: + # https://github.com/cython/cython/issues/3020 + c_result = CStructArray.MakeFromFieldNames( + c_arrays, c_names, c_mask, -1, 0) + else: + c_result = CStructArray.MakeFromFields( + c_arrays, c_fields, c_mask, -1, 0) + cdef Array result = pyarrow_wrap_array(GetResultValue(c_result)) + result.validate() + return result + + def sort(self, order="ascending", by=None, **kwargs): + """ + Sort the StructArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + by : str or None, default None + If to sort the array by one of its fields + or by the whole array. + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : StructArray + """ + if by is not None: + tosort, sort_keys = self._flattened_field(by), [("", order)] + else: + tosort, sort_keys = self, [(field.name, order) for field in self.type] + indices = _pc().sort_indices( + tosort, options=_pc().SortOptions(sort_keys=sort_keys, **kwargs) + ) + return self.take(indices) + + +cdef class RunEndEncodedArray(Array): + """ + Concrete class for Arrow run-end encoded arrays. 
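+
+    A minimal construction sketch (illustrative only)::
+
+        import pyarrow as pa
+
+        # logically equivalent to [1, 1, 1, 2, 2]
+        pa.RunEndEncodedArray.from_arrays(run_ends=[3, 5], values=[1, 2])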
+ """ + + @staticmethod + def _from_arrays(type, allow_none_for_type, logical_length, run_ends, values, logical_offset): + cdef: + int64_t _logical_length + Array _run_ends + Array _values + int64_t _logical_offset + shared_ptr[CDataType] c_type + shared_ptr[CRunEndEncodedArray] ree_array + + _logical_length = logical_length + _logical_offset = logical_offset + + type = ensure_type(type, allow_none=allow_none_for_type) + if type is not None: + _run_ends = asarray(run_ends, type=type.run_end_type) + _values = asarray(values, type=type.value_type) + c_type = pyarrow_unwrap_data_type(type) + with nogil: + ree_array = GetResultValue(CRunEndEncodedArray.Make( + c_type, _logical_length, _run_ends.sp_array, _values.sp_array, _logical_offset)) + else: + _run_ends = asarray(run_ends) + _values = asarray(values) + with nogil: + ree_array = GetResultValue(CRunEndEncodedArray.MakeFromArrays( + _logical_length, _run_ends.sp_array, _values.sp_array, _logical_offset)) + cdef Array result = pyarrow_wrap_array(ree_array) + result.validate(full=True) + return result + + @staticmethod + def from_arrays(run_ends, values, type=None): + """ + Construct RunEndEncodedArray from run_ends and values arrays. + + Parameters + ---------- + run_ends : Array (int16, int32, or int64 type) + The run_ends array. + values : Array (any type) + The values array. + type : pyarrow.DataType, optional + The run_end_encoded(run_end_type, value_type) array type. + + Returns + ------- + RunEndEncodedArray + """ + logical_length = scalar(run_ends[-1]).as_py() if len(run_ends) > 0 else 0 + return RunEndEncodedArray._from_arrays(type, True, logical_length, + run_ends, values, 0) + + @staticmethod + def from_buffers(DataType type, length, buffers, null_count=-1, offset=0, + children=None): + """ + Construct a RunEndEncodedArray from all the parameters that make up an + Array. + + RunEndEncodedArrays do not have buffers, only children arrays, but this + implementation is needed to satisfy the Array interface. + + Parameters + ---------- + type : DataType + The run_end_encoded(run_end_type, value_type) type. + length : int + The logical length of the run-end encoded array. Expected to match + the last value of the run_ends array (children[0]) minus the offset. + buffers : List[Buffer] + Empty List or [None]. + null_count : int, default -1 + The number of null entries in the array. Run-end encoded arrays + are specified to not have valid bits and null_count always equals 0. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array] + Nested type children containing the run_ends and values arrays. 
+ + Returns + ------- + RunEndEncodedArray + """ + children = children or [] + + if type.num_fields != len(children): + raise ValueError("RunEndEncodedType's expected number of children " + "({0}) did not match the passed number " + "({1}).".format(type.num_fields, len(children))) + + # buffers are validated as if we needed to pass them to C++, but + # _make_from_arrays will take care of filling in the expected + # buffers array containing a single NULL buffer on the C++ side + if len(buffers) == 0: + buffers = [None] + if buffers[0] is not None: + raise ValueError("RunEndEncodedType expects None as validity " + "bitmap, buffers[0] is not None") + if type.num_buffers != len(buffers): + raise ValueError("RunEndEncodedType's expected number of buffers " + "({0}) did not match the passed number " + "({1}).".format(type.num_buffers, len(buffers))) + + # null_count is also validated as if we needed it + if null_count != -1 and null_count != 0: + raise ValueError("RunEndEncodedType's expected null_count (0) " + "did not match passed number ({0})".format(null_count)) + + return RunEndEncodedArray._from_arrays(type, False, length, children[0], + children[1], offset) + + @property + def run_ends(self): + """ + An array holding the logical indexes of each run-end. + + The physical offset to the array is applied. + """ + cdef CRunEndEncodedArray* ree_array = (self.ap) + return pyarrow_wrap_array(ree_array.run_ends()) + + @property + def values(self): + """ + An array holding the values of each run. + + The physical offset to the array is applied. + """ + cdef CRunEndEncodedArray* ree_array = (self.ap) + return pyarrow_wrap_array(ree_array.values()) + + def find_physical_offset(self): + """ + Find the physical offset of this REE array. + + This is the offset of the run that contains the value of the first + logical element of this array considering its offset. + + This function uses binary-search, so it has a O(log N) cost. + """ + cdef CRunEndEncodedArray* ree_array = (self.ap) + return ree_array.FindPhysicalOffset() + + def find_physical_length(self): + """ + Find the physical length of this REE array. + + The physical length of an REE is the number of physical values (and + run-ends) necessary to represent the logical range of values from offset + to length. + + This function uses binary-search, so it has a O(log N) cost. + """ + cdef CRunEndEncodedArray* ree_array = (self.ap) + return ree_array.FindPhysicalLength() + + +cdef class ExtensionArray(Array): + """ + Concrete class for Arrow extension arrays. + """ + + @property + def storage(self): + cdef: + CExtensionArray* ext_array = (self.ap) + + return pyarrow_wrap_array(ext_array.storage()) + + @staticmethod + def from_storage(BaseExtensionType typ, Array storage): + """ + Construct ExtensionArray from type and storage array. + + Parameters + ---------- + typ : DataType + The extension type for the result array. + storage : Array + The underlying storage for the result array. + + Returns + ------- + ext_array : ExtensionArray + """ + cdef: + shared_ptr[CExtensionArray] ext_array + + if storage.type != typ.storage_type: + raise TypeError("Incompatible storage type {0} " + "for extension type {1}".format(storage.type, typ)) + + ext_array = make_shared[CExtensionArray](typ.sp_type, storage.sp_array) + cdef Array result = pyarrow_wrap_array( ext_array) + result.validate() + return result + + +class UuidArray(ExtensionArray): + """ + Concrete class for Arrow arrays of UUID data type. 
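# Editorial sketch (not part of the vendored diff): the run-end encoded helpers
# above in one place. Assumes only the usual top-level pyarrow exports; the data
# is illustrative.
import pyarrow as pa

run_ends = pa.array([3, 5, 6], type=pa.int32())
values = pa.array(["a", "b", "c"])
ree = pa.RunEndEncodedArray.from_arrays(run_ends, values)  # logical length 6

ree.run_ends                 # [3, 5, 6]
ree.values                   # ["a", "b", "c"]
ree.find_physical_offset()   # 0 for an unsliced array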
+ """ + + +cdef class FixedShapeTensorArray(ExtensionArray): + """ + Concrete class for fixed shape tensor extension arrays. + + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + + Create an extension array + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> pa.ExtensionArray.from_storage(tensor_type, storage) + + [ + [ + 1, + 2, + 3, + 4 + ], + [ + 10, + 20, + 30, + 40 + ], + [ + 100, + 200, + 300, + 400 + ] + ] + """ + + def to_numpy_ndarray(self): + """ + Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. + + The resulting ndarray will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + Ndarray representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + + return self.to_tensor().to_numpy() + + def to_tensor(self): + """ + Convert fixed shape tensor extension array to a pyarrow.Tensor. + + The resulting Tensor will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + + cdef: + CFixedShapeTensorArray* ext_array = (self.ap) + CResult[shared_ptr[CTensor]] ctensor + with nogil: + ctensor = ext_array.ToTensor() + return pyarrow_wrap_tensor(GetResultValue(ctensor)) + + @staticmethod + def from_numpy_ndarray(obj): + """ + Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. + The first dimension of ndarray will become the length of the fixed + shape tensor array. + If input array data is not contiguous a copy will be made. + + Parameters + ---------- + obj : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array( + ... [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], + ... dtype=np.float32) + >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 1, + 2, + 3, + 4, + 5, + 6 + ] + ] + """ + + if len(obj.shape) < 2: + raise ValueError( + "Cannot convert 1D array or scalar to fixed shape tensor array") + if np.prod(obj.shape) == 0: + raise ValueError("Expected a non-empty ndarray") + + permutation = (-np.array(obj.strides)).argsort(kind='stable') + if permutation[0] != 0: + raise ValueError('First stride needs to be largest to ensure that ' + 'individual tensor data is contiguous in memory.') + + arrow_type = from_numpy_dtype(obj.dtype) + shape = np.take(obj.shape, permutation) + values = np.ravel(obj, order="K") + + return ExtensionArray.from_storage( + fixed_shape_tensor(arrow_type, shape[1:], permutation=permutation[1:] - 1), + FixedSizeListArray.from_arrays(values, shape[1:].prod()) + ) + + +cdef class OpaqueArray(ExtensionArray): + """ + Concrete class for opaque extension arrays. + + Examples + -------- + Define the extension type for an opaque array + + >>> import pyarrow as pa + >>> opaque_type = pa.opaque( + ... pa.binary(), + ... type_name="geometry", + ... 
vendor_name="postgis", + ... ) + + Create an extension array + + >>> arr = [None, b"data"] + >>> storage = pa.array(arr, pa.binary()) + >>> pa.ExtensionArray.from_storage(opaque_type, storage) + + [ + null, + 64617461 + ] + """ + + +cdef class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. + + Examples + -------- + Define the extension type for an bool8 array + + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + + Create an extension array + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self, zero_copy_only=True, writable=False): + """ + Return a NumPy bool view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + if not writable: + try: + return self.storage.to_numpy().view(np.bool_) + except ArrowInvalid as e: + if zero_copy_only: + raise e + + return _pc().not_equal(self.storage, 0).to_numpy(zero_copy_only=zero_copy_only, writable=writable) + + @staticmethod + def from_storage(Int8Array storage): + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ + return ExtensionArray.from_storage(bool8(), storage) + + @staticmethod + def from_numpy(obj): + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. 
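# Editorial sketch (not part of the vendored diff): Bool8Array.from_storage and the
# two to_numpy paths described above. Assumes Bool8Array/bool8 are exposed at the
# pyarrow top level as in upstream 18.0; the values are illustrative.
import pyarrow as pa

storage = pa.array([1, 0, 2, -1], type=pa.int8())
flags = pa.Bool8Array.from_storage(storage)

view = flags.to_numpy()                                     # zero-copy bool view (no nulls)
copy = flags.to_numpy(zero_copy_only=False, writable=True)  # forces a writable copy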
+ + Parameters + ---------- + obj : numpy.ndarray + + Returns + ------- + bool8_array : Bool8Array + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ + + if obj.ndim != 1: + raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8 array") + + if obj.dtype not in [np.bool_, np.int8]: + raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 storage") + + storage_arr = array(obj.view(np.int8), type=int8()) + return Bool8Array.from_storage(storage_arr) + + +cdef dict _array_classes = { + _Type_NA: NullArray, + _Type_BOOL: BooleanArray, + _Type_UINT8: UInt8Array, + _Type_UINT16: UInt16Array, + _Type_UINT32: UInt32Array, + _Type_UINT64: UInt64Array, + _Type_INT8: Int8Array, + _Type_INT16: Int16Array, + _Type_INT32: Int32Array, + _Type_INT64: Int64Array, + _Type_DATE32: Date32Array, + _Type_DATE64: Date64Array, + _Type_TIMESTAMP: TimestampArray, + _Type_TIME32: Time32Array, + _Type_TIME64: Time64Array, + _Type_DURATION: DurationArray, + _Type_INTERVAL_MONTH_DAY_NANO: MonthDayNanoIntervalArray, + _Type_HALF_FLOAT: HalfFloatArray, + _Type_FLOAT: FloatArray, + _Type_DOUBLE: DoubleArray, + _Type_LIST: ListArray, + _Type_LARGE_LIST: LargeListArray, + _Type_LIST_VIEW: ListViewArray, + _Type_LARGE_LIST_VIEW: LargeListViewArray, + _Type_MAP: MapArray, + _Type_FIXED_SIZE_LIST: FixedSizeListArray, + _Type_SPARSE_UNION: UnionArray, + _Type_DENSE_UNION: UnionArray, + _Type_BINARY: BinaryArray, + _Type_STRING: StringArray, + _Type_LARGE_BINARY: LargeBinaryArray, + _Type_LARGE_STRING: LargeStringArray, + _Type_BINARY_VIEW: BinaryViewArray, + _Type_STRING_VIEW: StringViewArray, + _Type_DICTIONARY: DictionaryArray, + _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, + _Type_DECIMAL128: Decimal128Array, + _Type_DECIMAL256: Decimal256Array, + _Type_STRUCT: StructArray, + _Type_RUN_END_ENCODED: RunEndEncodedArray, + _Type_EXTENSION: ExtensionArray, +} + + +cdef inline shared_ptr[CBuffer] c_mask_inverted_from_obj(object mask, MemoryPool pool) except *: + """ + Convert mask array obj to c_mask while also inverting to signify 1 for valid and 0 for null + """ + cdef shared_ptr[CBuffer] c_mask + if mask is None: + c_mask = shared_ptr[CBuffer]() + elif isinstance(mask, Array): + if mask.type.id != Type_BOOL: + raise TypeError('Mask must be a pyarrow.Array of type boolean') + if mask.null_count != 0: + raise ValueError('Mask must not contain nulls') + inverted_mask = _pc().invert(mask, memory_pool=pool) + c_mask = pyarrow_unwrap_buffer(inverted_mask.buffers()[1]) + else: + raise TypeError('Mask must be a pyarrow.Array of type boolean') + return c_mask + + +cdef object get_array_class_from_type( + const shared_ptr[CDataType]& sp_data_type): + cdef CDataType* data_type = sp_data_type.get() + if data_type == NULL: + raise ValueError('Array data type was NULL') + + if data_type.id() == _Type_EXTENSION: + py_ext_data_type = pyarrow_wrap_data_type(sp_data_type) + return py_ext_data_type.__arrow_ext_class__() + else: + return _array_classes[data_type.id()] + + +cdef object get_values(object obj, bint* is_series): + if pandas_api.is_series(obj) or pandas_api.is_index(obj): + result = pandas_api.get_values(obj) + is_series[0] = True + elif isinstance(obj, np.ndarray): + result = obj + is_series[0] = False + else: + result = pandas_api.series(obj, copy=False).values + is_series[0] = False + + return result + + +def concat_arrays(arrays, MemoryPool memory_pool=None): + """ 
+ Concatenate the given arrays. + + The contents of the input arrays are copied into the returned array. + + Raises + ------ + ArrowInvalid + If not all of the arrays have the same type. + + Parameters + ---------- + arrays : iterable of pyarrow.Array + Arrays to concatenate, must be identically typed. + memory_pool : MemoryPool, default None + For memory allocations. If None, the default pool is used. + + Examples + -------- + >>> import pyarrow as pa + >>> arr1 = pa.array([2, 4, 5, 100]) + >>> arr2 = pa.array([2, 4]) + >>> pa.concat_arrays([arr1, arr2]) + + [ + 2, + 4, + 5, + 100, + 2, + 4 + ] + + """ + cdef: + vector[shared_ptr[CArray]] c_arrays + shared_ptr[CArray] c_concatenated + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + + for array in arrays: + if not isinstance(array, Array): + raise TypeError("Iterable should contain Array objects, " + "got {0} instead".format(type(array))) + c_arrays.push_back(pyarrow_unwrap_array(array)) + + with nogil: + c_concatenated = GetResultValue(Concatenate(c_arrays, pool)) + + return pyarrow_wrap_array(c_concatenated) + + +def _empty_array(DataType type): + """ + Create empty array of the given type. + """ + if type.id == Type_DICTIONARY: + arr = DictionaryArray.from_arrays( + _empty_array(type.index_type), _empty_array(type.value_type), + ordered=type.ordered) + else: + arr = array([], type=type) + return arr diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/benchmark.pxi b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/benchmark.pxi new file mode 100644 index 0000000000000000000000000000000000000000..ab251017db78706c97c7dee8044636c55c80167e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/benchmark.pxi @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def benchmark_PandasObjectIsNull(list obj): + Benchmark_PandasObjectIsNull(obj) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/benchmark.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..25ee1141f08d1f4ac19ab7ade92eafbf786d685a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/benchmark.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + + +from pyarrow.lib import benchmark_PandasObjectIsNull diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/builder.pxi b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/builder.pxi new file mode 100644 index 0000000000000000000000000000000000000000..fbab5bbdb5a0113b107a0a7db029883dabaf7f78 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/builder.pxi @@ -0,0 +1,150 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import math + + +cdef class StringBuilder(_Weakrefable): + """ + Builder class for UTF8 strings. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string'). + """ + cdef: + unique_ptr[CStringBuilder] builder + + def __cinit__(self, MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + self.builder.reset(new CStringBuilder(pool)) + + def append(self, value): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + if isinstance(value, (bytes, str)): + self.builder.get().Append(tobytes(value)) + elif value is None or math.isnan(value): + self.builder.get().AppendNull() + else: + raise TypeError('StringBuilder only accepts string objects') + + def append_values(self, values): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + for value in values: + self.append(value) + + def finish(self): + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + @property + def null_count(self): + return self.builder.get().null_count() + + def __len__(self): + return self.builder.get().length() + + +cdef class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). 
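# Editorial sketch (not part of the vendored diff): typical StringBuilder usage,
# mirrored by the StringViewBuilder below. Assumes the builders are reachable via
# pyarrow.lib, as in upstream pyarrow.
from pyarrow.lib import StringBuilder

b = StringBuilder()
b.append("hello")
b.append(None)                 # appended as a null
b.append_values(["x", "y"])
arr = b.finish()               # string Array; the builder is reset
arr.null_count                 # 1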
+ """ + cdef: + unique_ptr[CStringViewBuilder] builder + + def __cinit__(self, MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + self.builder.reset(new CStringViewBuilder(pool)) + + def append(self, value): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + if isinstance(value, (bytes, str)): + self.builder.get().Append(tobytes(value)) + elif value is None or math.isnan(value): + self.builder.get().AppendNull() + else: + raise TypeError('StringViewBuilder only accepts string objects') + + def append_values(self, values): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + for value in values: + self.append(value) + + def finish(self): + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + @property + def null_count(self): + return self.builder.get().null_count() + + def __len__(self): + return self.builder.get().length() diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/cffi.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/cffi.py new file mode 100644 index 0000000000000000000000000000000000000000..1da1a916914049513b89c68bd60f08ba32b67edb --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/cffi.py @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
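# Editorial sketch (not part of the vendored diff): the cffi module below only
# declares the Arrow C data interface structs; this is roughly how they are used
# together with the private _export_to_c/_import_from_c helpers, following the
# pattern in upstream pyarrow's C data interface documentation. Treat the names
# and casts here as assumptions, not as this file's API.
import pyarrow as pa
from pyarrow.cffi import ffi

arr = pa.array([1, 2, 3])
c_schema = ffi.new("struct ArrowSchema*")
c_array = ffi.new("struct ArrowArray*")
schema_ptr = int(ffi.cast("uintptr_t", c_schema))
array_ptr = int(ffi.cast("uintptr_t", c_array))

arr._export_to_c(array_ptr, schema_ptr)                        # producer side
roundtripped = pa.Array._import_from_c(array_ptr, schema_ptr)  # consumer side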
+ +from __future__ import absolute_import + +import cffi + +c_source = """ + struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; + }; + + struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; + }; + + struct ArrowArrayStream { + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback + void (*release)(struct ArrowArrayStream*); + // Opaque producer-specific data + void* private_data; + }; + + typedef int32_t ArrowDeviceType; + + struct ArrowDeviceArray { + struct ArrowArray array; + int64_t device_id; + ArrowDeviceType device_type; + void* sync_event; + int64_t reserved[3]; + }; + """ + +# TODO use out-of-line mode for faster import and avoid C parsing +ffi = cffi.FFI() +ffi.cdef(c_source) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/compat.pxi b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/compat.pxi new file mode 100644 index 0000000000000000000000000000000000000000..8cf106d5609b50dd84c082dcfd36aee5b16fbee4 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/compat.pxi @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def encode_file_path(path): + if isinstance(path, str): + # POSIX systems can handle utf-8. UTF8 is converted to utf16-le in + # libarrow + encoded_path = path.encode('utf-8') + else: + encoded_path = path + + # Windows file system requires utf-16le for file names; Arrow C++ libraries + # will convert utf8 to utf16 + return encoded_path + + +# Starting with Python 3.7, dicts are guaranteed to be insertion-ordered. +ordered_dict = dict + + +try: + import cloudpickle as pickle +except ImportError: + import pickle + + +def tobytes(o): + """ + Encode a unicode or bytes string to bytes. + + Parameters + ---------- + o : str or bytes + Input string. + """ + if isinstance(o, str): + return o.encode('utf8') + else: + return o + + +def frombytes(o, *, safe=False): + """ + Decode the given bytestring to unicode. + + Parameters + ---------- + o : bytes-like + Input object. 
+ safe : bool, default False + If true, raise on encoding errors. + """ + if safe: + return o.decode('utf8', errors='replace') + else: + return o.decode('utf8') diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/compute.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/compute.py new file mode 100644 index 0000000000000000000000000000000000000000..83612f66d21e2f54bb1ac161de4d5db4463675ac --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/compute.py @@ -0,0 +1,732 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._compute import ( # noqa + Function, + FunctionOptions, + FunctionRegistry, + HashAggregateFunction, + HashAggregateKernel, + Kernel, + ScalarAggregateFunction, + ScalarAggregateKernel, + ScalarFunction, + ScalarKernel, + VectorFunction, + VectorKernel, + # Option classes + ArraySortOptions, + AssumeTimezoneOptions, + CastOptions, + CountOptions, + CumulativeOptions, + CumulativeSumOptions, + DayOfWeekOptions, + DictionaryEncodeOptions, + RunEndEncodeOptions, + ElementWiseAggregateOptions, + ExtractRegexOptions, + FilterOptions, + IndexOptions, + JoinOptions, + ListSliceOptions, + ListFlattenOptions, + MakeStructOptions, + MapLookupOptions, + MatchSubstringOptions, + ModeOptions, + NullOptions, + PadOptions, + PairwiseOptions, + PartitionNthOptions, + QuantileOptions, + RandomOptions, + RankOptions, + ReplaceSliceOptions, + ReplaceSubstringOptions, + RoundBinaryOptions, + RoundOptions, + RoundTemporalOptions, + RoundToMultipleOptions, + ScalarAggregateOptions, + SelectKOptions, + SetLookupOptions, + SliceOptions, + SortOptions, + SplitOptions, + SplitPatternOptions, + StrftimeOptions, + StrptimeOptions, + StructFieldOptions, + TakeOptions, + TDigestOptions, + TrimOptions, + Utf8NormalizeOptions, + VarianceOptions, + WeekOptions, + # Functions + call_function, + function_registry, + get_function, + list_functions, + # Udf + call_tabular_function, + register_scalar_function, + register_tabular_function, + register_aggregate_function, + register_vector_function, + UdfContext, + # Expressions + Expression, +) + +from collections import namedtuple +import inspect +from textwrap import dedent +import warnings + +import pyarrow as pa +from pyarrow import _compute_docstrings +from pyarrow.vendored import docscrape + + +def _get_arg_names(func): + return func._doc.arg_names + + +_OptionsClassDoc = namedtuple('_OptionsClassDoc', ('params',)) + + +def _scrape_options_class_doc(options_class): + if not options_class.__doc__: + return None + doc = docscrape.NumpyDocString(options_class.__doc__) + return _OptionsClassDoc(doc['Parameters']) + + +def _decorate_compute_function(wrapper, exposed_name, func, options_class): + # Decorate the given compute function wrapper with useful metadata + # and 
documentation. + cpp_doc = func._doc + + wrapper.__arrow_compute_function__ = dict( + name=func.name, + arity=func.arity, + options_class=cpp_doc.options_class, + options_required=cpp_doc.options_required) + wrapper.__name__ = exposed_name + wrapper.__qualname__ = exposed_name + + doc_pieces = [] + + # 1. One-line summary + summary = cpp_doc.summary + if not summary: + arg_str = "arguments" if func.arity > 1 else "argument" + summary = ("Call compute function {!r} with the given {}" + .format(func.name, arg_str)) + + doc_pieces.append(f"{summary}.\n\n") + + # 2. Multi-line description + description = cpp_doc.description + if description: + doc_pieces.append(f"{description}\n\n") + + doc_addition = _compute_docstrings.function_doc_additions.get(func.name) + + # 3. Parameter description + doc_pieces.append(dedent("""\ + Parameters + ---------- + """)) + + # 3a. Compute function parameters + arg_names = _get_arg_names(func) + for arg_name in arg_names: + if func.kind in ('vector', 'scalar_aggregate'): + arg_type = 'Array-like' + else: + arg_type = 'Array-like or scalar-like' + doc_pieces.append(f"{arg_name} : {arg_type}\n") + doc_pieces.append(" Argument to compute function.\n") + + # 3b. Compute function option values + if options_class is not None: + options_class_doc = _scrape_options_class_doc(options_class) + if options_class_doc: + for p in options_class_doc.params: + doc_pieces.append(f"{p.name} : {p.type}\n") + for s in p.desc: + doc_pieces.append(f" {s}\n") + else: + warnings.warn(f"Options class {options_class.__name__} " + f"does not have a docstring", RuntimeWarning) + options_sig = inspect.signature(options_class) + for p in options_sig.parameters.values(): + doc_pieces.append(dedent("""\ + {0} : optional + Parameter for {1} constructor. Either `options` + or `{0}` can be passed, but not both at the same time. + """.format(p.name, options_class.__name__))) + doc_pieces.append(dedent(f"""\ + options : pyarrow.compute.{options_class.__name__}, optional + Alternative way of passing options. + """)) + + doc_pieces.append(dedent("""\ + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """)) + + # 4. Custom addition (e.g. 
examples) + if doc_addition is not None: + doc_pieces.append("\n{}\n".format(dedent(doc_addition).strip("\n"))) + + wrapper.__doc__ = "".join(doc_pieces) + return wrapper + + +def _get_options_class(func): + class_name = func._doc.options_class + if not class_name: + return None + try: + return globals()[class_name] + except KeyError: + warnings.warn("Python binding for {} not exposed" + .format(class_name), RuntimeWarning) + return None + + +def _handle_options(name, options_class, options, args, kwargs): + if args or kwargs: + if options is not None: + raise TypeError( + "Function {!r} called with both an 'options' argument " + "and additional arguments" + .format(name)) + return options_class(*args, **kwargs) + + if options is not None: + if isinstance(options, dict): + return options_class(**options) + elif isinstance(options, options_class): + return options + raise TypeError( + "Function {!r} expected a {} parameter, got {}" + .format(name, options_class, type(options))) + + return None + + +def _make_generic_wrapper(func_name, func, options_class, arity): + if options_class is None: + def wrapper(*args, memory_pool=None): + if arity is not Ellipsis and len(args) != arity: + raise TypeError( + f"{func_name} takes {arity} positional argument(s), " + f"but {len(args)} were given" + ) + if args and isinstance(args[0], Expression): + return Expression._call(func_name, list(args)) + return func.call(args, None, memory_pool) + else: + def wrapper(*args, memory_pool=None, options=None, **kwargs): + if arity is not Ellipsis: + if len(args) < arity: + raise TypeError( + f"{func_name} takes {arity} positional argument(s), " + f"but {len(args)} were given" + ) + option_args = args[arity:] + args = args[:arity] + else: + option_args = () + options = _handle_options(func_name, options_class, options, + option_args, kwargs) + if args and isinstance(args[0], Expression): + return Expression._call(func_name, list(args), options) + return func.call(args, options, memory_pool) + return wrapper + + +def _make_signature(arg_names, var_arg_names, options_class): + from inspect import Parameter + params = [] + for name in arg_names: + params.append(Parameter(name, Parameter.POSITIONAL_ONLY)) + for name in var_arg_names: + params.append(Parameter(name, Parameter.VAR_POSITIONAL)) + if options_class is not None: + options_sig = inspect.signature(options_class) + for p in options_sig.parameters.values(): + assert p.kind in (Parameter.POSITIONAL_OR_KEYWORD, + Parameter.KEYWORD_ONLY) + if var_arg_names: + # Cannot have a positional argument after a *args + p = p.replace(kind=Parameter.KEYWORD_ONLY) + params.append(p) + params.append(Parameter("options", Parameter.KEYWORD_ONLY, + default=None)) + params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY, + default=None)) + return inspect.Signature(params) + + +def _wrap_function(name, func): + options_class = _get_options_class(func) + arg_names = _get_arg_names(func) + has_vararg = arg_names and arg_names[-1].startswith('*') + if has_vararg: + var_arg_names = [arg_names.pop().lstrip('*')] + else: + var_arg_names = [] + + wrapper = _make_generic_wrapper( + name, func, options_class, arity=func.arity) + wrapper.__signature__ = _make_signature(arg_names, var_arg_names, + options_class) + return _decorate_compute_function(wrapper, name, func, options_class) + + +def _make_global_functions(): + """ + Make global functions wrapping each compute function. + + Note that some of the automatically-generated wrappers may be overridden + by custom versions below. 
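# Editorial sketch (not part of the vendored diff): the generated wrappers above
# accept either keyword arguments (routed through the options class by
# _handle_options) or an explicit options object -- the two calls below are
# equivalent. Ordinary pyarrow.compute usage, shown only for orientation.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, None, 2, 2, 3])

pc.count(arr, mode="only_valid")                           # kwargs build a CountOptions
pc.count(arr, options=pc.CountOptions(mode="only_valid"))  # explicit options instance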
+ """ + g = globals() + reg = function_registry() + + # Avoid clashes with Python keywords + rewrites = {'and': 'and_', + 'or': 'or_'} + + for cpp_name in reg.list_functions(): + name = rewrites.get(cpp_name, cpp_name) + func = reg.get_function(cpp_name) + if func.kind == "hash_aggregate": + # Hash aggregate functions are not callable, + # so let's not expose them at module level. + continue + if func.kind == "scalar_aggregate" and func.arity == 0: + # Nullary scalar aggregate functions are not callable + # directly so let's not expose them at module level. + continue + assert name not in g, name + g[cpp_name] = g[name] = _wrap_function(name, func) + + +_make_global_functions() + + +def cast(arr, target_type=None, safe=None, options=None, memory_pool=None): + """ + Cast array values to another data type. Can also be invoked as an array + instance method. + + Parameters + ---------- + arr : Array-like + target_type : DataType or str + Type to cast to + safe : bool, default True + Check for overflows or other unsafe conversions + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Examples + -------- + >>> from datetime import datetime + >>> import pyarrow as pa + >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) + >>> arr.type + TimestampType(timestamp[us]) + + You can use ``pyarrow.DataType`` objects to specify the target type: + + >>> cast(arr, pa.timestamp('ms')) + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + + >>> cast(arr, pa.timestamp('ms')).type + TimestampType(timestamp[ms]) + + Alternatively, it is also supported to use the string aliases for these + types: + + >>> arr.cast('timestamp[ms]') + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + >>> arr.cast('timestamp[ms]').type + TimestampType(timestamp[ms]) + + Returns + ------- + casted : Array + The cast result as a new Array + """ + safe_vars_passed = (safe is not None) or (target_type is not None) + + if safe_vars_passed and (options is not None): + raise ValueError("Must either pass values for 'target_type' and 'safe'" + " or pass a value for 'options'") + + if options is None: + target_type = pa.types.lib.ensure_type(target_type) + if safe is False: + options = CastOptions.unsafe(target_type) + else: + options = CastOptions.safe(target_type) + return call_function("cast", [arr], options, memory_pool) + + +def index(data, value, start=None, end=None, *, memory_pool=None): + """ + Find the index of the first occurrence of a given value. + + Parameters + ---------- + data : Array-like + value : Scalar-like object + The value to search for. + start : int, optional + end : int, optional + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ + Returns + ------- + index : int + the index, or -1 if not found + """ + if start is not None: + if end is not None: + data = data.slice(start, end - start) + else: + data = data.slice(start) + elif end is not None: + data = data.slice(0, end) + + if not isinstance(value, pa.Scalar): + value = pa.scalar(value, type=data.type) + elif data.type != value.type: + value = pa.scalar(value.as_py(), type=data.type) + options = IndexOptions(value=value) + result = call_function('index', [data], options, memory_pool) + if start is not None and result.as_py() >= 0: + result = pa.scalar(result.as_py() + start, type=pa.int64()) + return result + + +def take(data, indices, *, boundscheck=True, memory_pool=None): + """ + Select values (or records) from array- or table-like data given integer + selection indices. + + The result will be of the same type(s) as the input, with elements taken + from the input array (or record batch / table fields) at the given + indices. If an index is null then the corresponding value in the output + will be null. + + Parameters + ---------- + data : Array, ChunkedArray, RecordBatch, or Table + indices : Array, ChunkedArray + Must be of integer type + boundscheck : boolean, default True + Whether to boundscheck the indices. If False and there is an out of + bounds index, will likely cause the process to crash. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : depends on inputs + Selected values for the given indices + + Examples + -------- + >>> import pyarrow as pa + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> indices = pa.array([0, None, 4, 3]) + >>> arr.take(indices) + + [ + "a", + null, + "e", + null + ] + """ + options = TakeOptions(boundscheck=boundscheck) + return call_function('take', [data, indices], options, memory_pool) + + +def fill_null(values, fill_value): + """Replace each null element in values with a corresponding + element from fill_value. + + If fill_value is scalar-like, then every null element in values + will be replaced with fill_value. If fill_value is array-like, + then the i-th element in values will be replaced with the i-th + element in fill_value. + + The fill_value's type must be the same as that of values, or it + must be able to be implicitly casted to the array's type. + + This is an alias for :func:`coalesce`. + + Parameters + ---------- + values : Array, ChunkedArray, or Scalar-like object + Each null element is replaced with the corresponding value + from fill_value. + fill_value : Array, ChunkedArray, or Scalar-like object + If not same type as values, will attempt to cast. 
+ + Returns + ------- + result : depends on inputs + Values with all null elements replaced + + Examples + -------- + >>> import pyarrow as pa + >>> arr = pa.array([1, 2, None, 3], type=pa.int8()) + >>> fill_value = pa.scalar(5, type=pa.int8()) + >>> arr.fill_null(fill_value) + + [ + 1, + 2, + 5, + 3 + ] + >>> arr = pa.array([1, 2, None, 4, None]) + >>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) + + [ + 1, + 2, + 30, + 4, + 50 + ] + """ + if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)): + fill_value = pa.scalar(fill_value, type=values.type) + elif values.type != fill_value.type: + fill_value = pa.scalar(fill_value.as_py(), type=values.type) + + return call_function("coalesce", [values, fill_value]) + + +def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None): + """ + Select the indices of the top-k ordered elements from array- or table-like + data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get top indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array + Indices of the top-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.top_k_unstable(arr, k=3) + + [ + 5, + 4, + 2 + ] + """ + if sort_keys is None: + sort_keys = [] + if isinstance(values, (pa.Array, pa.ChunkedArray)): + sort_keys.append(("dummy", "descending")) + else: + sort_keys = map(lambda key_name: (key_name, "descending"), sort_keys) + options = SelectKOptions(k, sort_keys) + return call_function("select_k_unstable", [values], options, memory_pool) + + +def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None): + """ + Select the indices of the bottom-k ordered elements from + array- or table-like data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get bottom indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array of indices + Indices of the bottom-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.bottom_k_unstable(arr, k=3) + + [ + 0, + 1, + 2 + ] + """ + if sort_keys is None: + sort_keys = [] + if isinstance(values, (pa.Array, pa.ChunkedArray)): + sort_keys.append(("dummy", "ascending")) + else: + sort_keys = map(lambda key_name: (key_name, "ascending"), sort_keys) + options = SelectKOptions(k, sort_keys) + return call_function("select_k_unstable", [values], options, memory_pool) + + +def random(n, *, initializer='system', options=None, memory_pool=None): + """ + Generate numbers in the range [0, 1). + + Generated values are uniformly-distributed, double-precision + in range [0, 1). Algorithm and seed can be changed via RandomOptions. 
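# Editorial sketch (not part of the vendored diff): the random() helper whose
# parameters follow below, in two lines. Illustrative only.
import pyarrow.compute as pc

pc.random(3)                   # three doubles in [0, 1), system-seeded
pc.random(3, initializer=42)   # reproducible stream from an integer seed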
+ + Parameters + ---------- + n : int + Number of values to generate, must be greater than or equal to 0 + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + options : pyarrow.compute.RandomOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + options = RandomOptions(initializer=initializer) + return call_function("random", [], options, memory_pool, length=n) + + +def field(*name_or_index): + """Reference a column of the dataset. + + Stores only the field's name. Type and other information is known only when + the expression is bound to a dataset having an explicit scheme. + + Nested references are allowed by passing multiple names or a tuple of + names. For example ``('foo', 'bar')`` references the field named "bar" + inside the field named "foo". + + Parameters + ---------- + *name_or_index : string, multiple strings, tuple or int + The name or index of the (possibly nested) field the expression + references to. + + Returns + ------- + field_expr : Expression + Reference to the given field + + Examples + -------- + >>> import pyarrow.compute as pc + >>> pc.field("a") + + >>> pc.field(1) + + >>> pc.field(("a", "b")) + >> pc.field("a", "b") + tobytes(path) + + check_status(Initialize(options)) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/conftest.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..10a2e72f923cba3f7597a566830ffc55370f521c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/conftest.py @@ -0,0 +1,386 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
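# Editorial sketch (not part of the vendored diff): the field() doctest earlier in
# compute.py lost its printed Expression outputs during extraction; this is how such
# references are typically composed into a filter expression. The column names and
# the dataset.to_table(filter=...) call are illustrative assumptions.
import pyarrow.compute as pc

expr = (pc.field("year") >= 2009) & (pc.field("month") == 11)
# Evaluated lazily, e.g. dataset.to_table(filter=expr); pc.field("a", "b") and
# pc.field(("a", "b")) both reference the nested field a.b.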
+ +import pytest + +import os +import pyarrow as pa +from pyarrow import Codec +from pyarrow import fs +from pyarrow.lib import is_threading_enabled +from pyarrow.tests.util import windows_has_tzdata +import sys + + +groups = [ + 'acero', + 'azure', + 'brotli', + 'bz2', + 'cython', + 'dataset', + 'hypothesis', + 'fastparquet', + 'flight', + 'gandiva', + 'gcs', + 'gdb', + 'gzip', + 'hdfs', + 'large_memory', + 'lz4', + 'memory_leak', + 'nopandas', + 'nonumpy', + 'numpy', + 'orc', + 'pandas', + 'parquet', + 'parquet_encryption', + 'processes', + 'requires_testing_data', + 's3', + 'slow', + 'snappy', + 'sockets', + 'substrait', + 'threading', + 'timezone_data', + 'zstd', +] + +defaults = { + 'acero': False, + 'azure': False, + 'brotli': Codec.is_available('brotli'), + 'bz2': Codec.is_available('bz2'), + 'cython': False, + 'dataset': False, + 'fastparquet': False, + 'flight': False, + 'gandiva': False, + 'gcs': False, + 'gdb': True, + 'gzip': Codec.is_available('gzip'), + 'hdfs': False, + 'hypothesis': False, + 'large_memory': False, + 'lz4': Codec.is_available('lz4'), + 'memory_leak': False, + 'nopandas': False, + 'nonumpy': False, + 'numpy': False, + 'orc': False, + 'pandas': False, + 'parquet': False, + 'parquet_encryption': False, + 'processes': True, + 'requires_testing_data': True, + 's3': False, + 'slow': False, + 'snappy': Codec.is_available('snappy'), + 'sockets': True, + 'substrait': False, + 'threading': is_threading_enabled(), + 'timezone_data': True, + 'zstd': Codec.is_available('zstd'), +} + +if sys.platform == "emscripten": + # Emscripten doesn't support subprocess, + # multiprocessing, gdb or socket based + # networking + defaults['gdb'] = False + defaults['processes'] = False + defaults['sockets'] = False + +if sys.platform == "win32": + defaults['timezone_data'] = windows_has_tzdata() +elif sys.platform == "emscripten": + defaults['timezone_data'] = os.path.exists("/usr/share/zoneinfo") + +try: + import cython # noqa + defaults['cython'] = True +except ImportError: + pass + +try: + import fastparquet # noqa + defaults['fastparquet'] = True +except ImportError: + pass + +try: + import pyarrow.gandiva # noqa + defaults['gandiva'] = True +except ImportError: + pass + +try: + import pyarrow.acero # noqa + defaults['acero'] = True +except ImportError: + pass + +try: + import pyarrow.dataset # noqa + defaults['dataset'] = True +except ImportError: + pass + +try: + import pyarrow.orc # noqa + if sys.platform == "win32": + defaults['orc'] = True + else: + # orc tests on non-Windows platforms only work + # if timezone data exists, so skip them if + # not. 
+ defaults['orc'] = defaults['timezone_data'] +except ImportError: + pass + +try: + import pandas # noqa + defaults['pandas'] = True +except ImportError: + defaults['nopandas'] = True + +try: + import numpy # noqa + defaults['numpy'] = True +except ImportError: + defaults['nonumpy'] = True + +try: + import pyarrow.parquet # noqa + defaults['parquet'] = True +except ImportError: + pass + +try: + import pyarrow.parquet.encryption # noqa + defaults['parquet_encryption'] = True +except ImportError: + pass + +try: + import pyarrow.flight # noqa + defaults['flight'] = True +except ImportError: + pass + +try: + from pyarrow.fs import AzureFileSystem # noqa + defaults['azure'] = True +except ImportError: + pass + +try: + from pyarrow.fs import GcsFileSystem # noqa + defaults['gcs'] = True +except ImportError: + pass + +try: + from pyarrow.fs import S3FileSystem # noqa + defaults['s3'] = True +except ImportError: + pass + +try: + from pyarrow.fs import HadoopFileSystem # noqa + defaults['hdfs'] = True +except ImportError: + pass + +try: + import pyarrow.substrait # noqa + defaults['substrait'] = True +except ImportError: + pass + + +# Doctest should ignore files for the modules that are not built +def pytest_ignore_collect(path, config): + if config.option.doctestmodules: + # don't try to run doctests on the /tests directory + if "/pyarrow/tests/" in str(path): + return True + + doctest_groups = [ + 'dataset', + 'orc', + 'parquet', + 'flight', + 'substrait', + ] + + # handle cuda, flight, etc + for group in doctest_groups: + if 'pyarrow/{}'.format(group) in str(path): + if not defaults[group]: + return True + + if 'pyarrow/parquet/encryption' in str(path): + if not defaults['parquet_encryption']: + return True + + if 'pyarrow/cuda' in str(path): + try: + import pyarrow.cuda # noqa + return False + except ImportError: + return True + + if 'pyarrow/fs' in str(path): + try: + from pyarrow.fs import S3FileSystem # noqa + return False + except ImportError: + return True + + if getattr(config.option, "doctest_cython", False): + if "/pyarrow/tests/" in str(path): + return True + if "/pyarrow/_parquet_encryption" in str(path): + return True + + return False + + +# Save output files from doctest examples into temp dir +@pytest.fixture(autouse=True) +def _docdir(request): + + # Trigger ONLY for the doctests + doctest_m = request.config.option.doctestmodules + doctest_c = getattr(request.config.option, "doctest_cython", False) + + if doctest_m or doctest_c: + + # Get the fixture dynamically by its name. + tmpdir = request.getfixturevalue('tmpdir') + + # Chdir only for the duration of the test. + with tmpdir.as_cwd(): + yield + + else: + yield + + +# Define doctest_namespace for fs module docstring import +@pytest.fixture(autouse=True) +def add_fs(doctest_namespace, request, tmp_path): + + # Trigger ONLY for the doctests + doctest_m = request.config.option.doctestmodules + doctest_c = getattr(request.config.option, "doctest_cython", False) + + if doctest_m or doctest_c: + # fs import + doctest_namespace["fs"] = fs + + # Creation of an object and file with data + local = fs.LocalFileSystem() + path = tmp_path / 'pyarrow-fs-example.dat' + with local.open_output_stream(str(path)) as stream: + stream.write(b'data') + doctest_namespace["local"] = local + doctest_namespace["local_path"] = str(tmp_path) + doctest_namespace["path"] = str(path) + yield + + +# Define udf fixture for test_udf.py and test_substrait.py +@pytest.fixture(scope="session") +def unary_func_fixture(): + """ + Register a unary scalar function. 
+ """ + from pyarrow import compute as pc + + def unary_function(ctx, x): + return pc.call_function("add", [x, 1], + memory_pool=ctx.memory_pool) + func_name = "y=x+1" + unary_doc = {"summary": "add function", + "description": "test add function"} + pc.register_scalar_function(unary_function, + func_name, + unary_doc, + {"array": pa.int64()}, + pa.int64()) + return unary_function, func_name + + +@pytest.fixture(scope="session") +def unary_agg_func_fixture(): + """ + Register a unary aggregate function (mean) + """ + from pyarrow import compute as pc + import numpy as np + + def func(ctx, x): + return pa.scalar(np.nanmean(x)) + + func_name = "mean_udf" + func_doc = {"summary": "y=avg(x)", + "description": "find mean of x"} + + pc.register_aggregate_function(func, + func_name, + func_doc, + { + "x": pa.float64(), + }, + pa.float64() + ) + return func, func_name + + +@pytest.fixture(scope="session") +def varargs_agg_func_fixture(): + """ + Register a unary aggregate function + """ + from pyarrow import compute as pc + import numpy as np + + def func(ctx, *args): + sum = 0.0 + for arg in args: + sum += np.nanmean(arg) + return pa.scalar(sum) + + func_name = "sum_mean" + func_doc = {"summary": "Varargs aggregate", + "description": "Varargs aggregate"} + + pc.register_aggregate_function(func, + func_name, + func_doc, + { + "x": pa.int64(), + "y": pa.float64() + }, + pa.float64() + ) + return func, func_name diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/csv.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/csv.py new file mode 100644 index 0000000000000000000000000000000000000000..1ae197f9f200f44d8a8a65851a89025f61c4d842 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/csv.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from pyarrow._csv import ( # noqa + ReadOptions, ParseOptions, ConvertOptions, ISO8601, + open_csv, read_csv, CSVStreamingReader, write_csv, + WriteOptions, CSVWriter, InvalidRow) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/cuda.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/cuda.py new file mode 100644 index 0000000000000000000000000000000000000000..18c530d4afe406366b6ff7c12cbc1c6813081e04 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/cuda.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + + +from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, + HostBuffer, BufferReader, BufferWriter, + new_host_buffer, + serialize_record_batch, read_message, + read_record_batch) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/dataset.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..1efbfe1665a759618a371bbf326780beb8654ef7 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/dataset.py @@ -0,0 +1,1035 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Dataset is currently unstable. APIs subject to change without notice.""" + +import pyarrow as pa +from pyarrow.util import _is_iterable, _stringify_path, _is_path_like + +try: + from pyarrow._dataset import ( # noqa + CsvFileFormat, + CsvFragmentScanOptions, + JsonFileFormat, + JsonFragmentScanOptions, + Dataset, + DatasetFactory, + DirectoryPartitioning, + FeatherFileFormat, + FilenamePartitioning, + FileFormat, + FileFragment, + FileSystemDataset, + FileSystemDatasetFactory, + FileSystemFactoryOptions, + FileWriteOptions, + Fragment, + FragmentScanOptions, + HivePartitioning, + IpcFileFormat, + IpcFileWriteOptions, + InMemoryDataset, + Partitioning, + PartitioningFactory, + Scanner, + TaggedRecordBatch, + UnionDataset, + UnionDatasetFactory, + WrittenFile, + get_partition_keys, + get_partition_keys as _get_partition_keys, # keep for backwards compatibility + _filesystemdataset_write, + ) +except ImportError as exc: + raise ImportError( + f"The pyarrow installation is not built with support for 'dataset' ({str(exc)})" + ) from None + +# keep Expression functionality exposed here for backwards compatibility +from pyarrow.compute import Expression, scalar, field # noqa + + +_orc_available = False +_orc_msg = ( + "The pyarrow installation is not built with support for the ORC file " + "format." +) + +try: + from pyarrow._dataset_orc import OrcFileFormat + _orc_available = True +except ImportError: + pass + +_parquet_available = False +_parquet_msg = ( + "The pyarrow installation is not built with support for the Parquet file " + "format." 
+) + +try: + from pyarrow._dataset_parquet import ( # noqa + ParquetDatasetFactory, + ParquetFactoryOptions, + ParquetFileFormat, + ParquetFileFragment, + ParquetFileWriteOptions, + ParquetFragmentScanOptions, + ParquetReadOptions, + RowGroupInfo, + ) + _parquet_available = True +except ImportError: + pass + + +try: + from pyarrow._dataset_parquet_encryption import ( # noqa + ParquetDecryptionConfig, + ParquetEncryptionConfig, + ) +except ImportError: + pass + + +def __getattr__(name): + if name == "OrcFileFormat" and not _orc_available: + raise ImportError(_orc_msg) + + if name == "ParquetFileFormat" and not _parquet_available: + raise ImportError(_parquet_msg) + + raise AttributeError( + "module 'pyarrow.dataset' has no attribute '{0}'".format(name) + ) + + +def partitioning(schema=None, field_names=None, flavor=None, + dictionaries=None): + """ + Specify a partitioning scheme. + + The supported schemes include: + + - "DirectoryPartitioning": this scheme expects one segment in the file path + for each field in the specified schema (all fields are required to be + present). For example given schema the path + "/2009/11" would be parsed to ("year"_ == 2009 and "month"_ == 11). + - "HivePartitioning": a scheme for "/$key=$value/" nested directories as + found in Apache Hive. This is a multi-level, directory based partitioning + scheme. Data is partitioned by static values of a particular column in + the schema. Partition keys are represented in the form $key=$value in + directory names. Field order is ignored, as are missing or unrecognized + field names. + For example, given schema, a possible + path would be "/year=2009/month=11/day=15" (but the field order does not + need to match). + - "FilenamePartitioning": this scheme expects the partitions will have + filenames containing the field values separated by "_". + For example, given schema, a possible + partition filename "2009_11_part-0.parquet" would be parsed + to ("year"_ == 2009 and "month"_ == 11). + + Parameters + ---------- + schema : pyarrow.Schema, default None + The schema that describes the partitions present in the file path. + If not specified, and `field_names` and/or `flavor` are specified, + the schema will be inferred from the file path (and a + PartitioningFactory is returned). + field_names : list of str, default None + A list of strings (field names). If specified, the schema's types are + inferred from the file paths (only valid for DirectoryPartitioning). + flavor : str, default None + The default is DirectoryPartitioning. Specify ``flavor="hive"`` for + a HivePartitioning, and ``flavor="filename"`` for a + FilenamePartitioning. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. Alternatively, pass `infer` to have + Arrow discover the dictionary values, in which case a + PartitioningFactory is returned. + + Returns + ------- + Partitioning or PartitioningFactory + The partitioning scheme + + Examples + -------- + + Specify the Schema for paths like "/2009/June": + + >>> import pyarrow as pa + >>> import pyarrow.dataset as ds + >>> part = ds.partitioning(pa.schema([("year", pa.int16()), + ... 
("month", pa.string())])) + + or let the types be inferred by only specifying the field names: + + >>> part = ds.partitioning(field_names=["year", "month"]) + + For paths like "/2009/June", the year will be inferred as int32 while month + will be inferred as string. + + Specify a Schema with dictionary encoding, providing dictionary values: + + >>> part = ds.partitioning( + ... pa.schema([ + ... ("year", pa.int16()), + ... ("month", pa.dictionary(pa.int8(), pa.string())) + ... ]), + ... dictionaries={ + ... "month": pa.array(["January", "February", "March"]), + ... }) + + Alternatively, specify a Schema with dictionary encoding, but have Arrow + infer the dictionary values: + + >>> part = ds.partitioning( + ... pa.schema([ + ... ("year", pa.int16()), + ... ("month", pa.dictionary(pa.int8(), pa.string())) + ... ]), + ... dictionaries="infer") + + Create a Hive scheme for a path like "/year=2009/month=11": + + >>> part = ds.partitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())]), + ... flavor="hive") + + A Hive scheme can also be discovered from the directory structure (and + types will be inferred): + + >>> part = ds.partitioning(flavor="hive") + """ + if flavor is None: + # default flavor + if schema is not None: + if field_names is not None: + raise ValueError( + "Cannot specify both 'schema' and 'field_names'") + if dictionaries == 'infer': + return DirectoryPartitioning.discover(schema=schema) + return DirectoryPartitioning(schema, dictionaries) + elif field_names is not None: + if isinstance(field_names, list): + return DirectoryPartitioning.discover(field_names) + else: + raise ValueError( + "Expected list of field names, got {}".format( + type(field_names))) + else: + raise ValueError( + "For the default directory flavor, need to specify " + "a Schema or a list of field names") + if flavor == "filename": + if schema is not None: + if field_names is not None: + raise ValueError( + "Cannot specify both 'schema' and 'field_names'") + if dictionaries == 'infer': + return FilenamePartitioning.discover(schema=schema) + return FilenamePartitioning(schema, dictionaries) + elif field_names is not None: + if isinstance(field_names, list): + return FilenamePartitioning.discover(field_names) + else: + raise ValueError( + "Expected list of field names, got {}".format( + type(field_names))) + else: + raise ValueError( + "For the filename flavor, need to specify " + "a Schema or a list of field names") + elif flavor == 'hive': + if field_names is not None: + raise ValueError("Cannot specify 'field_names' for flavor 'hive'") + elif schema is not None: + if isinstance(schema, pa.Schema): + if dictionaries == 'infer': + return HivePartitioning.discover(schema=schema) + return HivePartitioning(schema, dictionaries) + else: + raise ValueError( + "Expected Schema for 'schema', got {}".format( + type(schema))) + else: + return HivePartitioning.discover() + else: + raise ValueError("Unsupported flavor") + + +def _ensure_partitioning(scheme): + """ + Validate input and return a Partitioning(Factory). + + It passes None through if no partitioning scheme is defined. 
+ """ + if scheme is None: + pass + elif isinstance(scheme, str): + scheme = partitioning(flavor=scheme) + elif isinstance(scheme, list): + scheme = partitioning(field_names=scheme) + elif isinstance(scheme, (Partitioning, PartitioningFactory)): + pass + else: + raise ValueError("Expected Partitioning or PartitioningFactory, got {}" + .format(type(scheme))) + return scheme + + +def _ensure_format(obj): + if isinstance(obj, FileFormat): + return obj + elif obj == "parquet": + if not _parquet_available: + raise ValueError(_parquet_msg) + return ParquetFileFormat() + elif obj in {"ipc", "arrow"}: + return IpcFileFormat() + elif obj == "feather": + return FeatherFileFormat() + elif obj == "csv": + return CsvFileFormat() + elif obj == "orc": + if not _orc_available: + raise ValueError(_orc_msg) + return OrcFileFormat() + elif obj == "json": + return JsonFileFormat() + else: + raise ValueError("format '{}' is not supported".format(obj)) + + +def _ensure_multiple_sources(paths, filesystem=None): + """ + Treat a list of paths as files belonging to a single file system + + If the file system is local then also validates that all paths + are referencing existing *files* otherwise any non-file paths will be + silently skipped (for example on a remote filesystem). + + Parameters + ---------- + paths : list of path-like + Note that URIs are not allowed. + filesystem : FileSystem or str, optional + If an URI is passed, then its path component will act as a prefix for + the file paths. + + Returns + ------- + (FileSystem, list of str) + File system object and a list of normalized paths. + + Raises + ------ + TypeError + If the passed filesystem has wrong type. + IOError + If the file system is local and a referenced path is not available or + not a file. + """ + from pyarrow.fs import ( + LocalFileSystem, SubTreeFileSystem, _MockFileSystem, FileType, + _ensure_filesystem + ) + + if filesystem is None: + # fall back to local file system as the default + filesystem = LocalFileSystem() + else: + # construct a filesystem if it is a valid URI + filesystem = _ensure_filesystem(filesystem) + + is_local = ( + isinstance(filesystem, (LocalFileSystem, _MockFileSystem)) or + (isinstance(filesystem, SubTreeFileSystem) and + isinstance(filesystem.base_fs, LocalFileSystem)) + ) + + # allow normalizing irregular paths such as Windows local paths + paths = [filesystem.normalize_path(_stringify_path(p)) for p in paths] + + # validate that all of the paths are pointing to existing *files* + # possible improvement is to group the file_infos by type and raise for + # multiple paths per error category + if is_local: + for info in filesystem.get_file_info(paths): + file_type = info.type + if file_type == FileType.File: + continue + elif file_type == FileType.NotFound: + raise FileNotFoundError(info.path) + elif file_type == FileType.Directory: + raise IsADirectoryError( + 'Path {} points to a directory, but only file paths are ' + 'supported. To construct a nested or union dataset pass ' + 'a list of dataset objects instead.'.format(info.path) + ) + else: + raise IOError( + 'Path {} exists but its type is unknown (could be a ' + 'special file such as a Unix socket or character device, ' + 'or Windows NUL / CON / ...)'.format(info.path) + ) + + return filesystem, paths + + +def _ensure_single_source(path, filesystem=None): + """ + Treat path as either a recursively traversable directory or a single file. 
+ + Parameters + ---------- + path : path-like + filesystem : FileSystem or str, optional + If an URI is passed, then its path component will act as a prefix for + the file paths. + + Returns + ------- + (FileSystem, list of str or fs.Selector) + File system object and either a single item list pointing to a file or + an fs.Selector object pointing to a directory. + + Raises + ------ + TypeError + If the passed filesystem has wrong type. + FileNotFoundError + If the referenced file or directory doesn't exist. + """ + from pyarrow.fs import FileType, FileSelector, _resolve_filesystem_and_path + + # at this point we already checked that `path` is a path-like + filesystem, path = _resolve_filesystem_and_path(path, filesystem) + + # ensure that the path is normalized before passing to dataset discovery + path = filesystem.normalize_path(path) + + # retrieve the file descriptor + file_info = filesystem.get_file_info(path) + + # depending on the path type either return with a recursive + # directory selector or as a list containing a single file + if file_info.type == FileType.Directory: + paths_or_selector = FileSelector(path, recursive=True) + elif file_info.type == FileType.File: + paths_or_selector = [path] + else: + raise FileNotFoundError(path) + + return filesystem, paths_or_selector + + +def _filesystem_dataset(source, schema=None, filesystem=None, + partitioning=None, format=None, + partition_base_dir=None, exclude_invalid_files=None, + selector_ignore_prefixes=None): + """ + Create a FileSystemDataset which can be used to build a Dataset. + + Parameters are documented in the dataset function. + + Returns + ------- + FileSystemDataset + """ + from pyarrow.fs import LocalFileSystem, _ensure_filesystem, FileInfo + + format = _ensure_format(format or 'parquet') + partitioning = _ensure_partitioning(partitioning) + + if isinstance(source, (list, tuple)): + if source and isinstance(source[0], FileInfo): + if filesystem is None: + # fall back to local file system as the default + fs = LocalFileSystem() + else: + # construct a filesystem if it is a valid URI + fs = _ensure_filesystem(filesystem) + paths_or_selector = source + else: + fs, paths_or_selector = _ensure_multiple_sources(source, filesystem) + else: + fs, paths_or_selector = _ensure_single_source(source, filesystem) + + options = FileSystemFactoryOptions( + partitioning=partitioning, + partition_base_dir=partition_base_dir, + exclude_invalid_files=exclude_invalid_files, + selector_ignore_prefixes=selector_ignore_prefixes + ) + factory = FileSystemDatasetFactory(fs, paths_or_selector, format, options) + + return factory.finish(schema) + + +def _in_memory_dataset(source, schema=None, **kwargs): + if any(v is not None for v in kwargs.values()): + raise ValueError( + "For in-memory datasets, you cannot pass any additional arguments") + return InMemoryDataset(source, schema) + + +def _union_dataset(children, schema=None, **kwargs): + if any(v is not None for v in kwargs.values()): + raise ValueError( + "When passing a list of Datasets, you cannot pass any additional " + "arguments" + ) + + if schema is None: + # unify the children datasets' schemas + schema = pa.unify_schemas([child.schema for child in children]) + + for child in children: + if getattr(child, "_scan_options", None): + raise ValueError( + "Creating an UnionDataset from filtered or projected Datasets " + "is currently not supported. Union the unfiltered datasets " + "and apply the filter to the resulting union." 
+ ) + + # create datasets with the requested schema + children = [child.replace_schema(schema) for child in children] + + return UnionDataset(schema, children) + + +def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None, + partitioning=None, partition_base_dir=None): + """ + Create a FileSystemDataset from a `_metadata` file created via + `pyarrow.parquet.write_metadata`. + + Parameters + ---------- + metadata_path : path, + Path pointing to a single file parquet metadata file + schema : Schema, optional + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. + filesystem : FileSystem or URI string, default None + If a single path is given as source and filesystem is None, then the + filesystem will be inferred from the path. + If an URI string is passed, then a filesystem object is constructed + using the URI's optional path component as a directory prefix. See the + examples below. + Note that the URIs on Windows must follow 'file:///C:...' or + 'file:/C:...' patterns. + format : ParquetFileFormat + An instance of a ParquetFileFormat if special options needs to be + passed. + partitioning : Partitioning, PartitioningFactory, str, list of str + The partitioning scheme specified with the ``partitioning()`` + function. A flavor string can be used as shortcut, and with a list of + field names a DirectoryPartitioning will be inferred. + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + + Returns + ------- + FileSystemDataset + The dataset corresponding to the given metadata + """ + from pyarrow.fs import LocalFileSystem, _ensure_filesystem + + if format is None: + format = ParquetFileFormat() + elif not isinstance(format, ParquetFileFormat): + raise ValueError("format argument must be a ParquetFileFormat") + + if filesystem is None: + filesystem = LocalFileSystem() + else: + filesystem = _ensure_filesystem(filesystem) + + metadata_path = filesystem.normalize_path(_stringify_path(metadata_path)) + options = ParquetFactoryOptions( + partition_base_dir=partition_base_dir, + partitioning=_ensure_partitioning(partitioning) + ) + + factory = ParquetDatasetFactory( + metadata_path, filesystem, format, options=options) + return factory.finish(schema) + + +def dataset(source, schema=None, format=None, filesystem=None, + partitioning=None, partition_base_dir=None, + exclude_invalid_files=None, ignore_prefixes=None): + """ + Open a dataset. + + Datasets provides functionality to efficiently work with tabular, + potentially larger than memory and multi-file dataset. + + - A unified interface for different sources, like Parquet and Feather + - Discovery of sources (crawling directories, handle directory-based + partitioned datasets, basic schema normalization) + - Optimized reading with predicate pushdown (filtering rows), projection + (selecting columns), parallel reading or fine-grained managing of tasks. + + Note that this is the high-level API, to have more control over the dataset + construction use the low-level API classes (FileSystemDataset, + FilesystemDatasetFactory, etc.) 
+ + Parameters + ---------- + source : path, list of paths, dataset, list of datasets, (list of) \ +RecordBatch or Table, iterable of RecordBatch, RecordBatchReader, or URI + Path pointing to a single file: + Open a FileSystemDataset from a single file. + Path pointing to a directory: + The directory gets discovered recursively according to a + partitioning scheme if given. + List of file paths: + Create a FileSystemDataset from explicitly given files. The files + must be located on the same filesystem given by the filesystem + parameter. + Note that in contrary of construction from a single file, passing + URIs as paths is not allowed. + List of datasets: + A nested UnionDataset gets constructed, it allows arbitrary + composition of other datasets. + Note that additional keyword arguments are not allowed. + (List of) batches or tables, iterable of batches, or RecordBatchReader: + Create an InMemoryDataset. If an iterable or empty list is given, + a schema must also be given. If an iterable or RecordBatchReader + is given, the resulting dataset can only be scanned once; further + attempts will raise an error. + schema : Schema, optional + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. + format : FileFormat or str + Currently "parquet", "ipc"/"arrow"/"feather", "csv", "json", and "orc" are + supported. For Feather, only version 2 files are supported. + filesystem : FileSystem or URI string, default None + If a single path is given as source and filesystem is None, then the + filesystem will be inferred from the path. + If an URI string is passed, then a filesystem object is constructed + using the URI's optional path component as a directory prefix. See the + examples below. + Note that the URIs on Windows must follow 'file:///C:...' or + 'file:/C:...' patterns. + partitioning : Partitioning, PartitioningFactory, str, list of str + The partitioning scheme specified with the ``partitioning()`` + function. A flavor string can be used as shortcut, and with a list of + field names a DirectoryPartitioning will be inferred. + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + exclude_invalid_files : bool, optional (default True) + If True, invalid files will be excluded (file format specific check). + This will incur IO for each files in a serial and single threaded + fashion. Disabling this feature will skip the IO, but unsupported + files may be present in the Dataset (resulting in an error at scan + time). + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + + Returns + ------- + dataset : Dataset + Either a FileSystemDataset or a UnionDataset depending on the source + parameter. + + Examples + -------- + Creating an example Table: + + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], + ... 'n_legs': [2, 2, 4, 4, 5, 100], + ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", + ... 
"Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "file.parquet") + + Opening a single file: + + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset("file.parquet", format="parquet") + >>> dataset.to_table() + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2020,2022,2021,2022,2019,2021]] + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + + Opening a single file with an explicit schema: + + >>> myschema = pa.schema([ + ... ('n_legs', pa.int64()), + ... ('animal', pa.string())]) + >>> dataset = ds.dataset("file.parquet", schema=myschema, format="parquet") + >>> dataset.to_table() + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + + Opening a dataset for a single directory: + + >>> ds.write_dataset(table, "partitioned_dataset", format="parquet", + ... partitioning=['year']) + >>> dataset = ds.dataset("partitioned_dataset", format="parquet") + >>> dataset.to_table() + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [["Brittle stars"],["Flamingo"],...["Parrot","Horse"]] + + For a single directory from a S3 bucket: + + >>> ds.dataset("s3://mybucket/nyc-taxi/", + ... format="parquet") # doctest: +SKIP + + Opening a dataset from a list of relatives local paths: + + >>> dataset = ds.dataset([ + ... "partitioned_dataset/2019/part-0.parquet", + ... "partitioned_dataset/2020/part-0.parquet", + ... "partitioned_dataset/2021/part-0.parquet", + ... ], format='parquet') + >>> dataset.to_table() + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[5],[2],[4,100]] + animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"]] + + With filesystem provided: + + >>> paths = [ + ... 'part0/data.parquet', + ... 'part1/data.parquet', + ... 'part3/data.parquet', + ... ] + >>> ds.dataset(paths, filesystem='file:///directory/prefix, + ... format='parquet') # doctest: +SKIP + + Which is equivalent with: + + >>> fs = SubTreeFileSystem("/directory/prefix", + ... LocalFileSystem()) # doctest: +SKIP + >>> ds.dataset(paths, filesystem=fs, format='parquet') # doctest: +SKIP + + With a remote filesystem URI: + + >>> paths = [ + ... 'nested/directory/part0/data.parquet', + ... 'nested/directory/part1/data.parquet', + ... 'nested/directory/part3/data.parquet', + ... ] + >>> ds.dataset(paths, filesystem='s3://bucket/', + ... format='parquet') # doctest: +SKIP + + Similarly to the local example, the directory prefix may be included in the + filesystem URI: + + >>> ds.dataset(paths, filesystem='s3://bucket/nested/directory', + ... format='parquet') # doctest: +SKIP + + Construction of a nested dataset: + + >>> ds.dataset([ + ... dataset("s3://old-taxi-data", format="parquet"), + ... dataset("local/path/to/data", format="ipc") + ... 
]) # doctest: +SKIP + """ + from pyarrow.fs import FileInfo + # collect the keyword arguments for later reuse + kwargs = dict( + schema=schema, + filesystem=filesystem, + partitioning=partitioning, + format=format, + partition_base_dir=partition_base_dir, + exclude_invalid_files=exclude_invalid_files, + selector_ignore_prefixes=ignore_prefixes + ) + + if _is_path_like(source): + return _filesystem_dataset(source, **kwargs) + elif isinstance(source, (tuple, list)): + if all(_is_path_like(elem) or isinstance(elem, FileInfo) for elem in source): + return _filesystem_dataset(source, **kwargs) + elif all(isinstance(elem, Dataset) for elem in source): + return _union_dataset(source, **kwargs) + elif all(isinstance(elem, (pa.RecordBatch, pa.Table)) + for elem in source): + return _in_memory_dataset(source, **kwargs) + else: + unique_types = set(type(elem).__name__ for elem in source) + type_names = ', '.join('{}'.format(t) for t in unique_types) + raise TypeError( + 'Expected a list of path-like or dataset objects, or a list ' + 'of batches or tables. The given list contains the following ' + 'types: {}'.format(type_names) + ) + elif isinstance(source, (pa.RecordBatch, pa.Table)): + return _in_memory_dataset(source, **kwargs) + else: + raise TypeError( + 'Expected a path-like, list of path-likes or a list of Datasets ' + 'instead of the given type: {}'.format(type(source).__name__) + ) + + +def _ensure_write_partitioning(part, schema, flavor): + if isinstance(part, PartitioningFactory): + raise ValueError("A PartitioningFactory cannot be used. " + "Did you call the partitioning function " + "without supplying a schema?") + + if isinstance(part, Partitioning) and flavor: + raise ValueError( + "Providing a partitioning_flavor with " + "a Partitioning object is not supported" + ) + elif isinstance(part, (tuple, list)): + # Name of fields were provided instead of a partitioning object. + # Create a partitioning factory with those field names. + part = partitioning( + schema=pa.schema([schema.field(f) for f in part]), + flavor=flavor + ) + elif part is None: + part = partitioning(pa.schema([]), flavor=flavor) + + if not isinstance(part, Partitioning): + raise ValueError( + "partitioning must be a Partitioning object or " + "a list of column names" + ) + + return part + + +def write_dataset(data, base_dir, *, basename_template=None, format=None, + partitioning=None, partitioning_flavor=None, schema=None, + filesystem=None, file_options=None, use_threads=True, + max_partitions=None, max_open_files=None, + max_rows_per_file=None, min_rows_per_group=None, + max_rows_per_group=None, file_visitor=None, + existing_data_behavior='error', create_dir=True): + """ + Write a dataset to a given format and partitioning. + + Parameters + ---------- + data : Dataset, Table/RecordBatch, RecordBatchReader, list of \ +Table/RecordBatch, or iterable of RecordBatch + The data to write. This can be a Dataset instance or + in-memory Arrow data. If an iterable is given, the schema must + also be given. + base_dir : str + The root directory where to write the dataset. + basename_template : str, optional + A template string used to generate basenames of written data files. + The token '{i}' will be replaced with an automatically incremented + integer. If not specified, it defaults to + "part-{i}." + format.default_extname + format : FileFormat or str + The format in which to write the dataset. Currently supported: + "parquet", "ipc"/"arrow"/"feather", and "csv". 
If a FileSystemDataset + is being written and `format` is not specified, it defaults to the + same format as the specified FileSystemDataset. When writing a + Table or RecordBatch, this keyword is required. + partitioning : Partitioning or list[str], optional + The partitioning scheme specified with the ``partitioning()`` + function or a list of field names. When providing a list of + field names, you can use ``partitioning_flavor`` to drive which + partitioning type should be used. + partitioning_flavor : str, optional + One of the partitioning flavors supported by + ``pyarrow.dataset.partitioning``. If omitted will use the + default of ``partitioning()`` which is directory partitioning. + schema : Schema, optional + filesystem : FileSystem, optional + file_options : pyarrow.dataset.FileWriteOptions, optional + FileFormat specific write options, created using the + ``FileFormat.make_write_options()`` function. + use_threads : bool, default True + Write files in parallel. If enabled, then maximum parallelism will be + used determined by the number of available CPU cores. + max_partitions : int, default 1024 + Maximum number of partitions any batch may be written into. + max_open_files : int, default 1024 + If greater than 0 then this will limit the maximum number of + files that can be left open. If an attempt is made to open + too many files then the least recently used file will be closed. + If this setting is set too low you may end up fragmenting your + data into many small files. + max_rows_per_file : int, default 0 + Maximum number of rows per file. If greater than 0 then this will + limit how many rows are placed in any single file. Otherwise there + will be no limit and one file will be created in each output + directory unless files need to be closed to respect max_open_files + min_rows_per_group : int, default 0 + Minimum number of rows per group. When the value is greater than 0, + the dataset writer will batch incoming data and only write the row + groups to the disk when sufficient rows have accumulated. + max_rows_per_group : int, default 1024 * 1024 + Maximum number of rows per group. If the value is greater than 0, + then the dataset writer may split up large incoming batches into + multiple row groups. If this value is set, then min_rows_per_group + should also be set. Otherwise it could end up with very small row + groups. + file_visitor : function + If set, this function will be called with a WrittenFile instance + for each file created during the call. This object will have both + a path attribute and a metadata attribute. + + The path attribute will be a string containing the path to + the created file. + + The metadata attribute will be the parquet metadata of the file. + This metadata will have the file path attribute set and can be used + to build a _metadata file. The metadata attribute will be None if + the format is not parquet. + + Example visitor which simple collects the filenames created:: + + visited_paths = [] + + def file_visitor(written_file): + visited_paths.append(written_file.path) + existing_data_behavior : 'error' | 'overwrite_or_ignore' | \ +'delete_matching' + Controls how the dataset will handle data that already exists in + the destination. The default behavior ('error') is to raise an error + if any data exists in the destination. + + 'overwrite_or_ignore' will ignore any existing data and will + overwrite files with the same name as an output file. Other + existing files will be ignored. 
This behavior, in combination + with a unique basename_template for each write, will allow for + an append workflow. + + 'delete_matching' is useful when you are writing a partitioned + dataset. The first time each partition directory is encountered + the entire directory will be deleted. This allows you to overwrite + old partitions completely. + create_dir : bool, default True + If False, directories will not be created. This can be useful for + filesystems that do not require directories. + """ + from pyarrow.fs import _resolve_filesystem_and_path + + if isinstance(data, (list, tuple)): + schema = schema or data[0].schema + data = InMemoryDataset(data, schema=schema) + elif isinstance(data, (pa.RecordBatch, pa.Table)): + schema = schema or data.schema + data = InMemoryDataset(data, schema=schema) + elif isinstance(data, pa.ipc.RecordBatchReader) or _is_iterable(data): + data = Scanner.from_batches(data, schema=schema) + schema = None + elif not isinstance(data, (Dataset, Scanner)): + raise ValueError( + "Only Dataset, Scanner, Table/RecordBatch, RecordBatchReader, " + "a list of Tables/RecordBatches, or iterable of batches are " + "supported." + ) + + if format is None and isinstance(data, FileSystemDataset): + format = data.format + else: + format = _ensure_format(format) + + if file_options is None: + file_options = format.make_write_options() + + if format != file_options.format: + raise TypeError("Supplied FileWriteOptions have format {}, " + "which doesn't match supplied FileFormat {}".format( + format, file_options)) + + if basename_template is None: + basename_template = "part-{i}." + format.default_extname + + if max_partitions is None: + max_partitions = 1024 + + if max_open_files is None: + max_open_files = 1024 + + if max_rows_per_file is None: + max_rows_per_file = 0 + + if max_rows_per_group is None: + max_rows_per_group = 1 << 20 + + if min_rows_per_group is None: + min_rows_per_group = 0 + + # at this point data is a Scanner or a Dataset, anything else + # was converted to one of those two. So we can grab the schema + # to build the partitioning object from Dataset. + if isinstance(data, Scanner): + partitioning_schema = data.projected_schema + else: + partitioning_schema = data.schema + partitioning = _ensure_write_partitioning(partitioning, + schema=partitioning_schema, + flavor=partitioning_flavor) + + filesystem, base_dir = _resolve_filesystem_and_path(base_dir, filesystem) + + if isinstance(data, Dataset): + scanner = data.scanner(use_threads=use_threads) + else: + # scanner was passed directly by the user, in which case a schema + # cannot be passed + if schema is not None: + raise ValueError("Cannot specify a schema when writing a Scanner") + scanner = data + + _filesystemdataset_write( + scanner, base_dir, basename_template, filesystem, partitioning, + file_options, max_partitions, file_visitor, existing_data_behavior, + max_open_files, max_rows_per_file, + min_rows_per_group, max_rows_per_group, create_dir + ) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/device.pxi b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/device.pxi new file mode 100644 index 0000000000000000000000000000000000000000..26256de62093e84075a5bfc3eba9a95d12db6195 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/device.pxi @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + + +cpdef enum DeviceAllocationType: + CPU = CDeviceAllocationType_kCPU + CUDA = CDeviceAllocationType_kCUDA + CUDA_HOST = CDeviceAllocationType_kCUDA_HOST + OPENCL = CDeviceAllocationType_kOPENCL + VULKAN = CDeviceAllocationType_kVULKAN + METAL = CDeviceAllocationType_kMETAL + VPI = CDeviceAllocationType_kVPI + ROCM = CDeviceAllocationType_kROCM + ROCM_HOST = CDeviceAllocationType_kROCM_HOST + EXT_DEV = CDeviceAllocationType_kEXT_DEV + CUDA_MANAGED = CDeviceAllocationType_kCUDA_MANAGED + ONEAPI = CDeviceAllocationType_kONEAPI + WEBGPU = CDeviceAllocationType_kWEBGPU + HEXAGON = CDeviceAllocationType_kHEXAGON + + +cdef object _wrap_device_allocation_type(CDeviceAllocationType device_type): + return DeviceAllocationType( device_type) + + +cdef class Device(_Weakrefable): + """ + Abstract interface for hardware devices + + This object represents a device with access to some memory spaces. + When handling a Buffer or raw memory address, it allows deciding in which + context the raw memory address should be interpreted + (e.g. CPU-accessible memory, or embedded memory on some particular GPU). + """ + + def __init__(self): + raise TypeError("Do not call Device's constructor directly, " + "use the device attribute of the MemoryManager instead.") + + cdef void init(self, const shared_ptr[CDevice]& device): + self.device = device + + @staticmethod + cdef wrap(const shared_ptr[CDevice]& device): + cdef Device self = Device.__new__(Device) + self.init(device) + return self + + cdef inline shared_ptr[CDevice] unwrap(self) nogil: + return self.device + + def __eq__(self, other): + if not isinstance(other, Device): + return False + return self.device.get().Equals(deref((other).device.get())) + + def __repr__(self): + return "".format(frombytes(self.device.get().ToString())) + + @property + def type_name(self): + """ + A shorthand for this device's type. + """ + return frombytes(self.device.get().type_name()) + + @property + def device_id(self): + """ + A device ID to identify this device if there are multiple of this type. + + If there is no "device_id" equivalent (such as for the main CPU device on + non-numa systems) returns -1. + """ + return self.device.get().device_id() + + @property + def is_cpu(self): + """ + Whether this device is the main CPU device. + + This shorthand method is very useful when deciding whether a memory address + is CPU-accessible. + """ + return self.device.get().is_cpu() + + @property + def device_type(self): + """ + Return the DeviceAllocationType of this device. + """ + return _wrap_device_allocation_type(self.device.get().device_type()) + + +cdef class MemoryManager(_Weakrefable): + """ + An object that provides memory management primitives. 
+ + A MemoryManager is always tied to a particular Device instance. + It can also have additional parameters (such as a MemoryPool to + allocate CPU memory). + + """ + + def __init__(self): + raise TypeError("Do not call MemoryManager's constructor directly, " + "use pyarrow.default_cpu_memory_manager() instead.") + + cdef void init(self, const shared_ptr[CMemoryManager]& mm): + self.memory_manager = mm + + @staticmethod + cdef wrap(const shared_ptr[CMemoryManager]& mm): + cdef MemoryManager self = MemoryManager.__new__(MemoryManager) + self.init(mm) + return self + + cdef inline shared_ptr[CMemoryManager] unwrap(self) nogil: + return self.memory_manager + + def __repr__(self): + return "".format( + frombytes(self.memory_manager.get().device().get().ToString()) + ) + + @property + def device(self): + """ + The device this MemoryManager is tied to. + """ + return Device.wrap(self.memory_manager.get().device()) + + @property + def is_cpu(self): + """ + Whether this MemoryManager is tied to the main CPU device. + + This shorthand method is very useful when deciding whether a memory + address is CPU-accessible. + """ + return self.memory_manager.get().is_cpu() + + +def default_cpu_memory_manager(): + """ + Return the default CPU MemoryManager instance. + + The returned singleton instance uses the default MemoryPool. + """ + return MemoryManager.wrap(c_default_cpu_memory_manager()) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/error.pxi b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/error.pxi new file mode 100644 index 0000000000000000000000000000000000000000..cbe25522e8d7ecbb8e0b7e5e024b9c22c56e6e9b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/error.pxi @@ -0,0 +1,274 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from cpython.exc cimport PyErr_CheckSignals, PyErr_SetInterrupt + +from pyarrow.includes.libarrow cimport CStatus +from pyarrow.includes.libarrow_python cimport IsPyError, RestorePyError +from pyarrow.includes.common cimport c_string + +from contextlib import contextmanager +import os +import signal +import threading + +from pyarrow.lib import is_threading_enabled +from pyarrow.util import _break_traceback_cycle_from_frame + + +class ArrowException(Exception): + pass + + +class ArrowInvalid(ValueError, ArrowException): + pass + + +class ArrowMemoryError(MemoryError, ArrowException): + pass + + +class ArrowKeyError(KeyError, ArrowException): + def __str__(self): + # Override KeyError.__str__, as it uses the repr() of the key + return ArrowException.__str__(self) + + +class ArrowTypeError(TypeError, ArrowException): + pass + + +class ArrowNotImplementedError(NotImplementedError, ArrowException): + pass + + +class ArrowCapacityError(ArrowException): + pass + + +class ArrowIndexError(IndexError, ArrowException): + pass + + +class ArrowSerializationError(ArrowException): + pass + + +class ArrowCancelled(ArrowException): + def __init__(self, message, signum=None): + super().__init__(message) + self.signum = signum + + +# Compatibility alias +ArrowIOError = IOError + + +# check_status() and convert_status() could be written directly in C++ +# if we didn't define Arrow-specific subclasses (ArrowInvalid etc.) +cdef int check_status(const CStatus& status) except -1 nogil: + if status.ok(): + return 0 + + with gil: + if IsPyError(status): + RestorePyError(status) + return -1 + + raise convert_status(status) + + +cdef object convert_status(const CStatus& status): + if IsPyError(status): + try: + RestorePyError(status) + except BaseException as e: + return e + + # We don't use Status::ToString() as it would redundantly include + # the C++ class name. + message = frombytes(status.message(), safe=True) + detail = status.detail() + if detail != nullptr: + message += ". 
Detail: " + frombytes(detail.get().ToString(), + safe=True) + + if status.IsInvalid(): + return ArrowInvalid(message) + elif status.IsIOError(): + # Note: OSError constructor is + # OSError(message) + # or + # OSError(errno, message, filename=None) + # or (on Windows) + # OSError(errno, message, filename, winerror) + errno = ErrnoFromStatus(status) + winerror = WinErrorFromStatus(status) + if winerror != 0: + return IOError(errno, message, None, winerror) + elif errno != 0: + return IOError(errno, message) + else: + return IOError(message) + elif status.IsOutOfMemory(): + return ArrowMemoryError(message) + elif status.IsKeyError(): + return ArrowKeyError(message) + elif status.IsNotImplemented(): + return ArrowNotImplementedError(message) + elif status.IsTypeError(): + return ArrowTypeError(message) + elif status.IsCapacityError(): + return ArrowCapacityError(message) + elif status.IsIndexError(): + return ArrowIndexError(message) + elif status.IsSerializationError(): + return ArrowSerializationError(message) + elif status.IsCancelled(): + signum = SignalFromStatus(status) + if signum > 0: + return ArrowCancelled(message, signum) + else: + return ArrowCancelled(message) + else: + message = frombytes(status.ToString(), safe=True) + return ArrowException(message) + + +# These are API functions for C++ PyArrow +cdef api int pyarrow_internal_check_status(const CStatus& status) \ + except -1 nogil: + return check_status(status) + +cdef api object pyarrow_internal_convert_status(const CStatus& status): + return convert_status(status) + + +cdef class StopToken: + cdef void init(self, CStopToken stop_token): + self.stop_token = move(stop_token) + + +cdef c_bool signal_handlers_enabled = True + + +def enable_signal_handlers(c_bool enable): + """ + Enable or disable interruption of long-running operations. + + By default, certain long running operations will detect user + interruptions, such as by pressing Ctrl-C. This detection relies + on setting a signal handler for the duration of the long-running + operation, and may therefore interfere with other frameworks or + libraries (such as an event loop). + + Parameters + ---------- + enable : bool + Whether to enable user interruption by setting a temporary + signal handler. + """ + global signal_handlers_enabled + signal_handlers_enabled = enable + + +# For internal use + +# Whether we need a workaround for https://bugs.python.org/issue42248 +have_signal_refcycle = (sys.version_info < (3, 8, 10) or + (3, 9) <= sys.version_info < (3, 9, 5) or + sys.version_info[:2] == (3, 10)) + +cdef class SignalStopHandler: + cdef: + StopToken _stop_token + vector[int] _signals + c_bool _enabled + + def __cinit__(self): + self._enabled = False + + self._init_signals() + if have_signal_refcycle: + _break_traceback_cycle_from_frame(sys._getframe(0)) + + self._stop_token = StopToken() + + if not self._signals.empty(): + maybe_source = SetSignalStopSource() + if not maybe_source.ok(): + # See ARROW-11841 / ARROW-17173: in complex interaction + # scenarios (such as R calling into Python), SetSignalStopSource() + # may have already activated a signal-receiving StopSource. + # Just warn instead of erroring out. + maybe_source.status().Warn() + else: + self._stop_token.init(deref(maybe_source).token()) + # signals don't work on Emscripten without threads. + # and possibly other single-thread environments. 
+ self._enabled = is_threading_enabled() + + def _init_signals(self): + if (signal_handlers_enabled and + threading.current_thread() is threading.main_thread()): + self._signals = [ + sig for sig in (signal.SIGINT, signal.SIGTERM) + if signal.getsignal(sig) not in (signal.SIG_DFL, + signal.SIG_IGN, None)] + + def __enter__(self): + if self._enabled: + check_status(RegisterCancellingSignalHandler(self._signals)) + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + if self._enabled: + UnregisterCancellingSignalHandler() + if exc_value is None: + # Make sure we didn't lose a signal + try: + check_status(self._stop_token.stop_token.Poll()) + except ArrowCancelled as e: + exc_value = e + if isinstance(exc_value, ArrowCancelled): + if exc_value.signum: + # Re-emit the exact same signal. We restored the Python signal + # handler above, so it should receive it. + if os.name == 'nt': + SendSignal(exc_value.signum) + else: + SendSignalToThread(exc_value.signum, + threading.main_thread().ident) + else: + # Simulate Python receiving a SIGINT + # (see https://bugs.python.org/issue43356 for why we can't + # simulate the exact signal number) + PyErr_SetInterrupt() + # Maximize chances of the Python signal handler being executed now. + # Otherwise a potential KeyboardInterrupt might be missed by an + # immediately enclosing try/except block. + PyErr_CheckSignals() + # ArrowCancelled will be re-raised if PyErr_CheckSignals() + # returned successfully. + + def __dealloc__(self): + if self._enabled: + ResetSignalStopSource() + + @property + def stop_token(self): + return self._stop_token diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/feather.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/feather.py new file mode 100644 index 0000000000000000000000000000000000000000..fbd0602597006734d66a9a965ea462fb35cbe178 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/feather.py @@ -0,0 +1,277 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +import os + +from pyarrow.pandas_compat import _pandas_api # noqa +from pyarrow.lib import (Codec, Table, # noqa + concat_tables, schema) +import pyarrow.lib as ext +from pyarrow import _feather +from pyarrow._feather import FeatherError # noqa: F401 + + +class FeatherDataset: + """ + Encapsulates details of reading a list of Feather files. 
+ + Parameters + ---------- + path_or_paths : List[str] + A list of file names + validate_schema : bool, default True + Check that individual file schemas are all the same / compatible + """ + + def __init__(self, path_or_paths, validate_schema=True): + self.paths = path_or_paths + self.validate_schema = validate_schema + + def read_table(self, columns=None): + """ + Read multiple feather files as a single pyarrow.Table + + Parameters + ---------- + columns : List[str] + Names of columns to read from the file + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns) + """ + _fil = read_table(self.paths[0], columns=columns) + self._tables = [_fil] + self.schema = _fil.schema + + for path in self.paths[1:]: + table = read_table(path, columns=columns) + if self.validate_schema: + self.validate_schemas(path, table) + self._tables.append(table) + return concat_tables(self._tables) + + def validate_schemas(self, piece, table): + if not self.schema.equals(table.schema): + raise ValueError('Schema in {!s} was different. \n' + '{!s}\n\nvs\n\n{!s}' + .format(piece, self.schema, + table.schema)) + + def read_pandas(self, columns=None, use_threads=True): + """ + Read multiple Parquet files as a single pandas DataFrame + + Parameters + ---------- + columns : List[str] + Names of columns to read from the file + use_threads : bool, default True + Use multiple threads when converting to pandas + + Returns + ------- + pandas.DataFrame + Content of the file as a pandas DataFrame (of columns) + """ + return self.read_table(columns=columns).to_pandas( + use_threads=use_threads) + + +def check_chunked_overflow(name, col): + if col.num_chunks == 1: + return + + if col.type in (ext.binary(), ext.string()): + raise ValueError("Column '{}' exceeds 2GB maximum capacity of " + "a Feather binary column. This restriction may be " + "lifted in the future".format(name)) + else: + # TODO(wesm): Not sure when else this might be reached + raise ValueError("Column '{}' of type {} was chunked on conversion " + "to Arrow and cannot be currently written to " + "Feather format".format(name, str(col.type))) + + +_FEATHER_SUPPORTED_CODECS = {'lz4', 'zstd', 'uncompressed'} + + +def write_feather(df, dest, compression=None, compression_level=None, + chunksize=None, version=2): + """ + Write a pandas.DataFrame to Feather format. + + Parameters + ---------- + df : pandas.DataFrame or pyarrow.Table + Data to write out as Feather format. + dest : str + Local destination path. + compression : string, default None + Can be one of {"zstd", "lz4", "uncompressed"}. The default of None uses + LZ4 for V2 files if it is available, otherwise uncompressed. + compression_level : int, default None + Use a compression level particular to the chosen compressor. If None + use the default compression level + chunksize : int, default None + For V2 files, the internal maximum size of Arrow RecordBatch chunks + when writing the Arrow IPC file format. None means use the default, + which is currently 64K + version : int, default 2 + Feather file version. Version 2 is the current. 
Version 1 is the more + limited legacy format + """ + if _pandas_api.have_pandas: + if (_pandas_api.has_sparse and + isinstance(df, _pandas_api.pd.SparseDataFrame)): + df = df.to_dense() + + if _pandas_api.is_data_frame(df): + # Feather v1 creates a new column in the resultant Table to + # store index information if index type is not RangeIndex + + if version == 1: + preserve_index = False + elif version == 2: + preserve_index = None + else: + raise ValueError("Version value should either be 1 or 2") + + table = Table.from_pandas(df, preserve_index=preserve_index) + + if version == 1: + # Version 1 does not chunking + for i, name in enumerate(table.schema.names): + col = table[i] + check_chunked_overflow(name, col) + else: + table = df + + if version == 1: + if len(table.column_names) > len(set(table.column_names)): + raise ValueError("cannot serialize duplicate column names") + + if compression is not None: + raise ValueError("Feather V1 files do not support compression " + "option") + + if chunksize is not None: + raise ValueError("Feather V1 files do not support chunksize " + "option") + else: + if compression is None and Codec.is_available('lz4_frame'): + compression = 'lz4' + elif (compression is not None and + compression not in _FEATHER_SUPPORTED_CODECS): + raise ValueError('compression="{}" not supported, must be ' + 'one of {}'.format(compression, + _FEATHER_SUPPORTED_CODECS)) + + try: + _feather.write_feather(table, dest, compression=compression, + compression_level=compression_level, + chunksize=chunksize, version=version) + except Exception: + if isinstance(dest, str): + try: + os.remove(dest) + except os.error: + pass + raise + + +def read_feather(source, columns=None, use_threads=True, + memory_map=False, **kwargs): + """ + Read a pandas.DataFrame from Feather format. To read as pyarrow.Table use + feather.read_table. + + Parameters + ---------- + source : str file path, or file-like object + You can use MemoryMappedFile as source, for explicitly use memory map. + columns : sequence, optional + Only read a specific set of columns. If not provided, all columns are + read. + use_threads : bool, default True + Whether to parallelize reading using multiple threads. If false the + restriction is used in the conversion to Pandas as well as in the + reading from Feather format. + memory_map : boolean, default False + Use memory mapping when opening file on disk, when source is a str. + **kwargs + Additional keyword arguments passed on to `pyarrow.Table.to_pandas`. + + Returns + ------- + df : pandas.DataFrame + The contents of the Feather file as a pandas.DataFrame + """ + return (read_table( + source, columns=columns, memory_map=memory_map, + use_threads=use_threads).to_pandas(use_threads=use_threads, **kwargs)) + + +def read_table(source, columns=None, memory_map=False, use_threads=True): + """ + Read a pyarrow.Table from Feather format + + Parameters + ---------- + source : str file path, or file-like object + You can use MemoryMappedFile as source, for explicitly use memory map. + columns : sequence, optional + Only read a specific set of columns. If not provided, all columns are + read. + memory_map : boolean, default False + Use memory mapping when opening file on disk, when source is a str + use_threads : bool, default True + Whether to parallelize reading using multiple threads. 
+ + Returns + ------- + table : pyarrow.Table + The contents of the Feather file as a pyarrow.Table + """ + reader = _feather.FeatherReader( + source, use_memory_map=memory_map, use_threads=use_threads) + + if columns is None: + return reader.read() + + column_types = [type(column) for column in columns] + if all(map(lambda t: t == int, column_types)): + table = reader.read_indices(columns) + elif all(map(lambda t: t == str, column_types)): + table = reader.read_names(columns) + else: + column_type_names = [t.__name__ for t in column_types] + raise TypeError("Columns must be indices or names. " + "Got columns {} of types {}" + .format(columns, column_type_names)) + + # Feather v1 already respects the column selection + if reader.version < 3: + return table + # Feather v2 reads with sorted / deduplicated selection + elif sorted(set(columns)) == columns: + return table + else: + # follow exact order / selection of names + return table.select(columns) diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/flight.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/flight.py new file mode 100644 index 0000000000000000000000000000000000000000..b1836907c6744161c86f32e873316923c60b4226 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/flight.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
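The Feather helpers defined above (write_feather, read_table, read_feather) cover the whole round trip: write a pandas.DataFrame or a Table, then read it back whole or by column. A minimal sketch of how they fit together; the file name and the DataFrame contents here are made-up illustrations, not part of this diff:

import pandas as pd
import pyarrow.feather as feather

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# V2 files (the default) support compression; V1 would reject these options.
feather.write_feather(df, "example.feather", compression="zstd")

# Column selection accepts either names or integer indices (see read_table above).
table = feather.read_table("example.feather", columns=["a"])
df_again = feather.read_feather("example.feather", use_threads=True)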
+ +try: + from pyarrow._flight import ( # noqa:F401 + connect, + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + ) +except ImportError as exc: + raise ImportError( + f"The pyarrow installation is not built with support for 'flight' ({str(exc)})" + ) from None diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/fs.py b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/fs.py new file mode 100644 index 0000000000000000000000000000000000000000..abdd1a995751aa32aeba2a84176747e22bc64744 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/fs.py @@ -0,0 +1,431 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +FileSystem abstraction to interact with various local and remote filesystems. +""" + +from pyarrow.util import _is_path_like, _stringify_path + +from pyarrow._fs import ( # noqa + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, + _copy_files, + _copy_files_selector, +) + +# For backward compatibility. +FileStats = FileInfo + +_not_imported = [] +try: + from pyarrow._azurefs import AzureFileSystem # noqa +except ImportError: + _not_imported.append("AzureFileSystem") + +try: + from pyarrow._hdfs import HadoopFileSystem # noqa +except ImportError: + _not_imported.append("HadoopFileSystem") + +try: + from pyarrow._gcsfs import GcsFileSystem # noqa +except ImportError: + _not_imported.append("GcsFileSystem") + +try: + from pyarrow._s3fs import ( # noqa + AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy, + S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized, + finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region) +except ImportError: + _not_imported.append("S3FileSystem") +else: + # GH-38364: we don't initialize S3 eagerly as that could lead + # to crashes at shutdown even when S3 isn't used. 
+ # Instead, S3 is initialized lazily using `ensure_s3_initialized` + # in assorted places. + import atexit + atexit.register(ensure_s3_finalized) + + +def __getattr__(name): + if name in _not_imported: + raise ImportError( + "The pyarrow installation is not built with support for " + "'{0}'".format(name) + ) + + raise AttributeError( + "module 'pyarrow.fs' has no attribute '{0}'".format(name) + ) + + +def _filesystem_from_str(uri): + # instantiate the file system from an uri, if the uri has a path + # component then it will be treated as a path prefix + filesystem, prefix = FileSystem.from_uri(uri) + prefix = filesystem.normalize_path(prefix) + if prefix: + # validate that the prefix is pointing to a directory + prefix_info = filesystem.get_file_info([prefix])[0] + if prefix_info.type != FileType.Directory: + raise ValueError( + "The path component of the filesystem URI must point to a " + "directory but it has a type: `{}`. The path component " + "is `{}` and the given filesystem URI is `{}`".format( + prefix_info.type.name, prefix_info.path, uri + ) + ) + filesystem = SubTreeFileSystem(prefix, filesystem) + return filesystem + + +def _ensure_filesystem(filesystem, *, use_mmap=False): + if isinstance(filesystem, FileSystem): + return filesystem + elif isinstance(filesystem, str): + if use_mmap: + raise ValueError( + "Specifying to use memory mapping not supported for " + "filesystem specified as an URI string" + ) + return _filesystem_from_str(filesystem) + + # handle fsspec-compatible filesystems + try: + import fsspec + except ImportError: + pass + else: + if isinstance(filesystem, fsspec.AbstractFileSystem): + if type(filesystem).__name__ == 'LocalFileSystem': + # In case its a simple LocalFileSystem, use native arrow one + return LocalFileSystem(use_mmap=use_mmap) + return PyFileSystem(FSSpecHandler(filesystem)) + + raise TypeError( + "Unrecognized filesystem: {}. `filesystem` argument must be a " + "FileSystem instance or a valid file system URI'".format( + type(filesystem)) + ) + + +def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False): + """ + Return filesystem/path from path which could be an URI or a plain + filesystem path. + """ + if not _is_path_like(path): + if filesystem is not None: + raise ValueError( + "'filesystem' passed but the specified path is file-like, so" + " there is nothing to open with 'filesystem'." 
+ ) + return filesystem, path + + if filesystem is not None: + filesystem = _ensure_filesystem(filesystem, use_mmap=memory_map) + if isinstance(filesystem, LocalFileSystem): + path = _stringify_path(path) + elif not isinstance(path, str): + raise TypeError( + "Expected string path; path-like objects are only allowed " + "with a local filesystem" + ) + path = filesystem.normalize_path(path) + return filesystem, path + + path = _stringify_path(path) + + # if filesystem is not given, try to automatically determine one + # first check if the file exists as a local (relative) file path + # if not then try to parse the path as an URI + filesystem = LocalFileSystem(use_mmap=memory_map) + + try: + file_info = filesystem.get_file_info(path) + except ValueError: # ValueError means path is likely an URI + file_info = None + exists_locally = False + else: + exists_locally = (file_info.type != FileType.NotFound) + + # if the file or directory doesn't exists locally, then assume that + # the path is an URI describing the file system as well + if not exists_locally: + try: + filesystem, path = FileSystem.from_uri(path) + except ValueError as e: + # neither an URI nor a locally existing path, so assume that + # local path was given and propagate a nicer file not found error + # instead of a more confusing scheme parsing error + if "empty scheme" not in str(e) \ + and "Cannot parse URI" not in str(e): + raise + else: + path = filesystem.normalize_path(path) + + return filesystem, path + + +def copy_files(source, destination, + source_filesystem=None, destination_filesystem=None, + *, chunk_size=1024*1024, use_threads=True): + """ + Copy files between FileSystems. + + This functions allows you to recursively copy directories of files from + one file system to another, such as from S3 to your local machine. + + Parameters + ---------- + source : string + Source file path or URI to a single file or directory. + If a directory, files will be copied recursively from this path. + destination : string + Destination file path or URI. If `source` is a file, `destination` + is also interpreted as the destination file (not directory). + Directories will be created as necessary. + source_filesystem : FileSystem, optional + Source filesystem, needs to be specified if `source` is not a URI, + otherwise inferred. + destination_filesystem : FileSystem, optional + Destination filesystem, needs to be specified if `destination` is not + a URI, otherwise inferred. + chunk_size : int, default 1MB + The maximum size of block to read before flushing to the + destination file. A larger chunk_size will use more memory while + copying but may help accommodate high latency FileSystems. + use_threads : bool, default True + Whether to use multiple threads to accelerate copying. + + Examples + -------- + Inspect an S3 bucket's files: + + >>> s3, path = fs.FileSystem.from_uri( + ... "s3://registry.opendata.aws/roda/ndjson/") + >>> selector = fs.FileSelector(path) + >>> s3.get_file_info(selector) + [>> fs.copy_files("s3://registry.opendata.aws/roda/ndjson/index.ndjson", + ... "file:///{}/index_copy.ndjson".format(local_path)) + + >>> fs.LocalFileSystem().get_file_info(str(local_path)+ + ... '/index_copy.ndjson') + + + Copy file using a FileSystem object: + + >>> fs.copy_files("registry.opendata.aws/roda/ndjson/index.ndjson", + ... "file:///{}/index_copy.ndjson".format(local_path), + ... 
source_filesystem=fs.S3FileSystem()) + """ + source_fs, source_path = _resolve_filesystem_and_path( + source, source_filesystem + ) + destination_fs, destination_path = _resolve_filesystem_and_path( + destination, destination_filesystem + ) + + file_info = source_fs.get_file_info(source_path) + if file_info.type == FileType.Directory: + source_sel = FileSelector(source_path, recursive=True) + _copy_files_selector(source_fs, source_sel, + destination_fs, destination_path, + chunk_size, use_threads) + else: + _copy_files(source_fs, source_path, + destination_fs, destination_path, + chunk_size, use_threads) + + +class FSSpecHandler(FileSystemHandler): + """ + Handler for fsspec-based Python filesystems. + + https://filesystem-spec.readthedocs.io/en/latest/index.html + + Parameters + ---------- + fs : FSSpec-compliant filesystem instance + + Examples + -------- + >>> PyFileSystem(FSSpecHandler(fsspec_fs)) # doctest: +SKIP + """ + + def __init__(self, fs): + self.fs = fs + + def __eq__(self, other): + if isinstance(other, FSSpecHandler): + return self.fs == other.fs + return NotImplemented + + def __ne__(self, other): + if isinstance(other, FSSpecHandler): + return self.fs != other.fs + return NotImplemented + + def get_type_name(self): + protocol = self.fs.protocol + if isinstance(protocol, list): + protocol = protocol[0] + return "fsspec+{0}".format(protocol) + + def normalize_path(self, path): + return path + + @staticmethod + def _create_file_info(path, info): + size = info["size"] + if info["type"] == "file": + ftype = FileType.File + elif info["type"] == "directory": + ftype = FileType.Directory + # some fsspec filesystems include a file size for directories + size = None + else: + ftype = FileType.Unknown + return FileInfo(path, ftype, size=size, mtime=info.get("mtime", None)) + + def get_file_info(self, paths): + infos = [] + for path in paths: + try: + info = self.fs.info(path) + except FileNotFoundError: + infos.append(FileInfo(path, FileType.NotFound)) + else: + infos.append(self._create_file_info(path, info)) + return infos + + def get_file_info_selector(self, selector): + if not self.fs.isdir(selector.base_dir): + if self.fs.exists(selector.base_dir): + raise NotADirectoryError(selector.base_dir) + else: + if selector.allow_not_found: + return [] + else: + raise FileNotFoundError(selector.base_dir) + + if selector.recursive: + maxdepth = None + else: + maxdepth = 1 + + infos = [] + selected_files = self.fs.find( + selector.base_dir, maxdepth=maxdepth, withdirs=True, detail=True + ) + for path, info in selected_files.items(): + _path = path.strip("/") + base_dir = selector.base_dir.strip("/") + # Need to exclude base directory from selected files if present + # (fsspec filesystems, see GH-37555) + if _path != base_dir: + infos.append(self._create_file_info(path, info)) + + return infos + + def create_dir(self, path, recursive): + # mkdir also raises FileNotFoundError when base directory is not found + try: + self.fs.mkdir(path, create_parents=recursive) + except FileExistsError: + pass + + def delete_dir(self, path): + self.fs.rm(path, recursive=True) + + def _delete_dir_contents(self, path, missing_dir_ok): + try: + subpaths = self.fs.listdir(path, detail=False) + except FileNotFoundError: + if missing_dir_ok: + return + raise + for subpath in subpaths: + if self.fs.isdir(subpath): + self.fs.rm(subpath, recursive=True) + elif self.fs.isfile(subpath): + self.fs.rm(subpath) + + def delete_dir_contents(self, path, missing_dir_ok): + if path.strip("/") == "": + raise ValueError( + 
"delete_dir_contents called on path '", path, "'") + self._delete_dir_contents(path, missing_dir_ok) + + def delete_root_dir_contents(self): + self._delete_dir_contents("/") + + def delete_file(self, path): + # fs.rm correctly raises IsADirectoryError when `path` is a directory + # instead of a file and `recursive` is not set to True + if not self.fs.exists(path): + raise FileNotFoundError(path) + self.fs.rm(path) + + def move(self, src, dest): + self.fs.mv(src, dest, recursive=True) + + def copy_file(self, src, dest): + # fs.copy correctly raises IsADirectoryError when `src` is a directory + # instead of a file + self.fs.copy(src, dest) + + # TODO can we read/pass metadata (e.g. Content-Type) in the methods below? + + def open_input_stream(self, path): + from pyarrow import PythonFile + + if not self.fs.isfile(path): + raise FileNotFoundError(path) + + return PythonFile(self.fs.open(path, mode="rb"), mode="r") + + def open_input_file(self, path): + from pyarrow import PythonFile + + if not self.fs.isfile(path): + raise FileNotFoundError(path) + + return PythonFile(self.fs.open(path, mode="rb"), mode="r") + + def open_output_stream(self, path, metadata): + from pyarrow import PythonFile + + return PythonFile(self.fs.open(path, mode="wb"), mode="w") + + def open_append_stream(self, path, metadata): + from pyarrow import PythonFile + + return PythonFile(self.fs.open(path, mode="ab"), mode="w") diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/gandiva.pyx b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/gandiva.pyx new file mode 100644 index 0000000000000000000000000000000000000000..2202ec64f29628d76143759220eb61102d1bea97 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/gandiva.pyx @@ -0,0 +1,760 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: language_level = 3 + +from libcpp.memory cimport shared_ptr +from libcpp.string cimport string as c_string +from libcpp.vector cimport vector as c_vector +from libcpp.unordered_set cimport unordered_set as c_unordered_set +from libc.stdint cimport int64_t, int32_t + +from pyarrow.includes.libarrow cimport * +from pyarrow.lib cimport (DataType, Field, MemoryPool, RecordBatch, + Schema, check_status, pyarrow_wrap_array, + pyarrow_wrap_data_type, ensure_type, _Weakrefable, + pyarrow_wrap_field) + +from pyarrow.includes.libgandiva cimport ( + CCondition, CGandivaExpression, + CNode, CProjector, CFilter, + CSelectionVector, + _ensure_selection_mode, + CConfiguration, + CConfigurationBuilder, + TreeExprBuilder_MakeExpression, + TreeExprBuilder_MakeFunction, + TreeExprBuilder_MakeBoolLiteral, + TreeExprBuilder_MakeUInt8Literal, + TreeExprBuilder_MakeUInt16Literal, + TreeExprBuilder_MakeUInt32Literal, + TreeExprBuilder_MakeUInt64Literal, + TreeExprBuilder_MakeInt8Literal, + TreeExprBuilder_MakeInt16Literal, + TreeExprBuilder_MakeInt32Literal, + TreeExprBuilder_MakeInt64Literal, + TreeExprBuilder_MakeFloatLiteral, + TreeExprBuilder_MakeDoubleLiteral, + TreeExprBuilder_MakeStringLiteral, + TreeExprBuilder_MakeBinaryLiteral, + TreeExprBuilder_MakeField, + TreeExprBuilder_MakeIf, + TreeExprBuilder_MakeAnd, + TreeExprBuilder_MakeOr, + TreeExprBuilder_MakeCondition, + TreeExprBuilder_MakeInExpressionInt32, + TreeExprBuilder_MakeInExpressionInt64, + TreeExprBuilder_MakeInExpressionTime32, + TreeExprBuilder_MakeInExpressionTime64, + TreeExprBuilder_MakeInExpressionDate32, + TreeExprBuilder_MakeInExpressionDate64, + TreeExprBuilder_MakeInExpressionTimeStamp, + TreeExprBuilder_MakeInExpressionString, + SelectionVector_MakeInt16, + SelectionVector_MakeInt32, + SelectionVector_MakeInt64, + Projector_Make, + Filter_Make, + CFunctionSignature, + GetRegisteredFunctionSignatures) + + +cdef class Node(_Weakrefable): + cdef: + shared_ptr[CNode] node + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly, use the " + "TreeExprBuilder API directly" + .format(self.__class__.__name__)) + + @staticmethod + cdef create(shared_ptr[CNode] node): + cdef Node self = Node.__new__(Node) + self.node = node + return self + + def __str__(self): + return self.node.get().ToString().decode() + + def __repr__(self): + type_format = object.__repr__(self) + return '{0}\n{1}'.format(type_format, str(self)) + + def return_type(self): + return pyarrow_wrap_data_type(self.node.get().return_type()) + + +cdef class Expression(_Weakrefable): + cdef: + shared_ptr[CGandivaExpression] expression + + cdef void init(self, shared_ptr[CGandivaExpression] expression): + self.expression = expression + + def __str__(self): + return self.expression.get().ToString().decode() + + def __repr__(self): + type_format = object.__repr__(self) + return '{0}\n{1}'.format(type_format, str(self)) + + def root(self): + return Node.create(self.expression.get().root()) + + def result(self): + return pyarrow_wrap_field(self.expression.get().result()) + + +cdef class Condition(_Weakrefable): + cdef: + shared_ptr[CCondition] condition + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly, use the " + "TreeExprBuilder API instead" + .format(self.__class__.__name__)) + + @staticmethod + cdef create(shared_ptr[CCondition] condition): + cdef Condition self = Condition.__new__(Condition) + self.condition = condition + return self + + def __str__(self): + 
return self.condition.get().ToString().decode() + + def __repr__(self): + type_format = object.__repr__(self) + return '{0}\n{1}'.format(type_format, str(self)) + + def root(self): + return Node.create(self.condition.get().root()) + + def result(self): + return pyarrow_wrap_field(self.condition.get().result()) + + +cdef class SelectionVector(_Weakrefable): + cdef: + shared_ptr[CSelectionVector] selection_vector + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly." + .format(self.__class__.__name__)) + + @staticmethod + cdef create(shared_ptr[CSelectionVector] selection_vector): + cdef SelectionVector self = SelectionVector.__new__(SelectionVector) + self.selection_vector = selection_vector + return self + + def to_array(self): + cdef shared_ptr[CArray] result = self.selection_vector.get().ToArray() + return pyarrow_wrap_array(result) + + +cdef class Projector(_Weakrefable): + cdef: + shared_ptr[CProjector] projector + MemoryPool pool + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly, use " + "make_projector instead" + .format(self.__class__.__name__)) + + @staticmethod + cdef create(shared_ptr[CProjector] projector, MemoryPool pool): + cdef Projector self = Projector.__new__(Projector) + self.projector = projector + self.pool = pool + return self + + @property + def llvm_ir(self): + return self.projector.get().DumpIR().decode() + + def evaluate(self, RecordBatch batch, SelectionVector selection=None): + """ + Evaluate the specified record batch and return the arrays at the + filtered positions. + + Parameters + ---------- + batch : pyarrow.RecordBatch + selection : pyarrow.gandiva.SelectionVector + + Returns + ------- + list[pyarrow.Array] + """ + cdef vector[shared_ptr[CArray]] results + if selection is None: + check_status(self.projector.get().Evaluate( + batch.sp_batch.get()[0], self.pool.pool, &results)) + else: + check_status( + self.projector.get().Evaluate( + batch.sp_batch.get()[0], selection.selection_vector.get(), + self.pool.pool, &results)) + cdef shared_ptr[CArray] result + arrays = [] + for result in results: + arrays.append(pyarrow_wrap_array(result)) + return arrays + + +cdef class Filter(_Weakrefable): + cdef: + shared_ptr[CFilter] filter + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly, use " + "make_filter instead" + .format(self.__class__.__name__)) + + @staticmethod + cdef create(shared_ptr[CFilter] filter): + cdef Filter self = Filter.__new__(Filter) + self.filter = filter + return self + + @property + def llvm_ir(self): + return self.filter.get().DumpIR().decode() + + def evaluate(self, RecordBatch batch, MemoryPool pool, dtype='int32'): + """ + Evaluate the specified record batch and return a selection vector. 
+ + Parameters + ---------- + batch : pyarrow.RecordBatch + pool : MemoryPool + dtype : DataType or str, default int32 + + Returns + ------- + pyarrow.gandiva.SelectionVector + """ + cdef: + DataType type = ensure_type(dtype) + shared_ptr[CSelectionVector] selection + + if type.id == _Type_INT16: + check_status(SelectionVector_MakeInt16( + batch.num_rows, pool.pool, &selection)) + elif type.id == _Type_INT32: + check_status(SelectionVector_MakeInt32( + batch.num_rows, pool.pool, &selection)) + elif type.id == _Type_INT64: + check_status(SelectionVector_MakeInt64( + batch.num_rows, pool.pool, &selection)) + else: + raise ValueError("'dtype' of the selection vector should be " + "one of 'int16', 'int32' and 'int64'.") + + check_status(self.filter.get().Evaluate( + batch.sp_batch.get()[0], selection)) + return SelectionVector.create(selection) + + +cdef class TreeExprBuilder(_Weakrefable): + + def make_literal(self, value, dtype): + """ + Create a node on a literal. + + Parameters + ---------- + value : a literal value + dtype : DataType + + Returns + ------- + pyarrow.gandiva.Node + """ + cdef: + DataType type = ensure_type(dtype) + shared_ptr[CNode] r + + if type.id == _Type_BOOL: + r = TreeExprBuilder_MakeBoolLiteral(value) + elif type.id == _Type_UINT8: + r = TreeExprBuilder_MakeUInt8Literal(value) + elif type.id == _Type_UINT16: + r = TreeExprBuilder_MakeUInt16Literal(value) + elif type.id == _Type_UINT32: + r = TreeExprBuilder_MakeUInt32Literal(value) + elif type.id == _Type_UINT64: + r = TreeExprBuilder_MakeUInt64Literal(value) + elif type.id == _Type_INT8: + r = TreeExprBuilder_MakeInt8Literal(value) + elif type.id == _Type_INT16: + r = TreeExprBuilder_MakeInt16Literal(value) + elif type.id == _Type_INT32: + r = TreeExprBuilder_MakeInt32Literal(value) + elif type.id == _Type_INT64: + r = TreeExprBuilder_MakeInt64Literal(value) + elif type.id == _Type_FLOAT: + r = TreeExprBuilder_MakeFloatLiteral(value) + elif type.id == _Type_DOUBLE: + r = TreeExprBuilder_MakeDoubleLiteral(value) + elif type.id == _Type_STRING: + r = TreeExprBuilder_MakeStringLiteral(value.encode('UTF-8')) + elif type.id == _Type_BINARY: + r = TreeExprBuilder_MakeBinaryLiteral(value) + else: + raise TypeError("Didn't recognize dtype " + str(dtype)) + + return Node.create(r) + + def make_expression(self, Node root_node not None, + Field return_field not None): + """ + Create an expression with the specified root_node, + and the result written to result_field. + + Parameters + ---------- + root_node : pyarrow.gandiva.Node + return_field : pyarrow.Field + + Returns + ------- + pyarrow.gandiva.Expression + """ + cdef shared_ptr[CGandivaExpression] r = TreeExprBuilder_MakeExpression( + root_node.node, return_field.sp_field) + cdef Expression expression = Expression() + expression.init(r) + return expression + + def make_function(self, name, children, DataType return_type): + """ + Create a node with a function. + + Parameters + ---------- + name : str + children : pyarrow.gandiva.NodeVector + return_type : DataType + + Returns + ------- + pyarrow.gandiva.Node + """ + cdef c_vector[shared_ptr[CNode]] c_children + cdef Node child + for child in children: + if child is None: + raise TypeError("Child nodes must not be None") + c_children.push_back(child.node) + cdef shared_ptr[CNode] r = TreeExprBuilder_MakeFunction( + name.encode(), c_children, return_type.sp_type) + return Node.create(r) + + def make_field(self, Field field not None): + """ + Create a node with an Arrow field. 
+ + Parameters + ---------- + field : pyarrow.Field + + Returns + ------- + pyarrow.gandiva.Node + """ + cdef shared_ptr[CNode] r = TreeExprBuilder_MakeField(field.sp_field) + return Node.create(r) + + def make_if(self, Node condition not None, Node this_node not None, + Node else_node not None, DataType return_type not None): + """ + Create a node with an if-else expression. + + Parameters + ---------- + condition : pyarrow.gandiva.Node + this_node : pyarrow.gandiva.Node + else_node : pyarrow.gandiva.Node + return_type : DataType + + Returns + ------- + pyarrow.gandiva.Node + """ + cdef shared_ptr[CNode] r = TreeExprBuilder_MakeIf( + condition.node, this_node.node, else_node.node, + return_type.sp_type) + return Node.create(r) + + def make_and(self, children): + """ + Create a Node with a boolean AND expression. + + Parameters + ---------- + children : list[pyarrow.gandiva.Node] + + Returns + ------- + pyarrow.gandiva.Node + """ + cdef c_vector[shared_ptr[CNode]] c_children + cdef Node child + for child in children: + if child is None: + raise TypeError("Child nodes must not be None") + c_children.push_back(child.node) + cdef shared_ptr[CNode] r = TreeExprBuilder_MakeAnd(c_children) + return Node.create(r) + + def make_or(self, children): + """ + Create a Node with a boolean OR expression. + + Parameters + ---------- + children : list[pyarrow.gandiva.Node] + + Returns + ------- + pyarrow.gandiva.Node + """ + cdef c_vector[shared_ptr[CNode]] c_children + cdef Node child + for child in children: + if child is None: + raise TypeError("Child nodes must not be None") + c_children.push_back(child.node) + cdef shared_ptr[CNode] r = TreeExprBuilder_MakeOr(c_children) + return Node.create(r) + + def _make_in_expression_int32(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[int32_t] c_values + cdef int32_t v + for v in values: + c_values.insert(v) + r = TreeExprBuilder_MakeInExpressionInt32(node.node, c_values) + return Node.create(r) + + def _make_in_expression_int64(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[int64_t] c_values + cdef int64_t v + for v in values: + c_values.insert(v) + r = TreeExprBuilder_MakeInExpressionInt64(node.node, c_values) + return Node.create(r) + + def _make_in_expression_time32(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[int32_t] c_values + cdef int32_t v + for v in values: + c_values.insert(v) + r = TreeExprBuilder_MakeInExpressionTime32(node.node, c_values) + return Node.create(r) + + def _make_in_expression_time64(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[int64_t] c_values + cdef int64_t v + for v in values: + c_values.insert(v) + r = TreeExprBuilder_MakeInExpressionTime64(node.node, c_values) + return Node.create(r) + + def _make_in_expression_date32(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[int32_t] c_values + cdef int32_t v + for v in values: + c_values.insert(v) + r = TreeExprBuilder_MakeInExpressionDate32(node.node, c_values) + return Node.create(r) + + def _make_in_expression_date64(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[int64_t] c_values + cdef int64_t v + for v in values: + c_values.insert(v) + r = TreeExprBuilder_MakeInExpressionDate64(node.node, c_values) + return Node.create(r) + + def _make_in_expression_timestamp(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[int64_t] 
c_values + cdef int64_t v + for v in values: + c_values.insert(v) + r = TreeExprBuilder_MakeInExpressionTimeStamp(node.node, c_values) + return Node.create(r) + + def _make_in_expression_binary(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[c_string] c_values + cdef c_string v + for v in values: + c_values.insert(v) + r = TreeExprBuilder_MakeInExpressionString(node.node, c_values) + return Node.create(r) + + def _make_in_expression_string(self, Node node not None, values): + cdef shared_ptr[CNode] r + cdef c_unordered_set[c_string] c_values + cdef c_string _v + for v in values: + _v = v.encode('UTF-8') + c_values.insert(_v) + r = TreeExprBuilder_MakeInExpressionString(node.node, c_values) + return Node.create(r) + + def make_in_expression(self, Node node not None, values, dtype): + """ + Create a Node with an IN expression. + + Parameters + ---------- + node : pyarrow.gandiva.Node + values : iterable + dtype : DataType + + Returns + ------- + pyarrow.gandiva.Node + """ + cdef DataType type = ensure_type(dtype) + + if type.id == _Type_INT32: + return self._make_in_expression_int32(node, values) + elif type.id == _Type_INT64: + return self._make_in_expression_int64(node, values) + elif type.id == _Type_TIME32: + return self._make_in_expression_time32(node, values) + elif type.id == _Type_TIME64: + return self._make_in_expression_time64(node, values) + elif type.id == _Type_TIMESTAMP: + return self._make_in_expression_timestamp(node, values) + elif type.id == _Type_DATE32: + return self._make_in_expression_date32(node, values) + elif type.id == _Type_DATE64: + return self._make_in_expression_date64(node, values) + elif type.id == _Type_BINARY: + return self._make_in_expression_binary(node, values) + elif type.id == _Type_STRING: + return self._make_in_expression_string(node, values) + else: + raise TypeError("Data type " + str(dtype) + " not supported.") + + def make_condition(self, Node condition not None): + """ + Create a condition with the specified node. + + Parameters + ---------- + condition : pyarrow.gandiva.Node + + Returns + ------- + pyarrow.gandiva.Condition + """ + cdef shared_ptr[CCondition] r = TreeExprBuilder_MakeCondition( + condition.node) + return Condition.create(r) + +cdef class Configuration(_Weakrefable): + cdef: + shared_ptr[CConfiguration] configuration + + def __cinit__(self, bint optimize=True, bint dump_ir=False): + """ + Initialize the configuration with specified options. + + Parameters + ---------- + optimize : bool, default True + Whether to enable optimizations. + dump_ir : bool, default False + Whether to dump LLVM IR. + """ + self.configuration = CConfigurationBuilder().build() + self.configuration.get().set_optimize(optimize) + self.configuration.get().set_dump_ir(dump_ir) + + @staticmethod + cdef create(shared_ptr[CConfiguration] configuration): + """ + Create a Configuration instance from an existing CConfiguration pointer. + + Parameters + ---------- + configuration : shared_ptr[CConfiguration] + Existing CConfiguration pointer. + + Returns + ------- + Configuration instance + """ + cdef Configuration self = Configuration.__new__(Configuration) + self.configuration = configuration + return self + + +cpdef make_projector(Schema schema, children, MemoryPool pool, + str selection_mode="NONE", + Configuration configuration=None): + """ + Construct a projection using expressions. + + A projector is built for a specific schema and vector of expressions. 
+ Once the projector is built, it can be used to evaluate many row batches. + + Parameters + ---------- + schema : pyarrow.Schema + Schema for the record batches, and the expressions. + children : list[pyarrow.gandiva.Expression] + List of projectable expression objects. + pool : pyarrow.MemoryPool + Memory pool used to allocate output arrays. + selection_mode : str, default "NONE" + Possible values are NONE, UINT16, UINT32, UINT64. + configuration : pyarrow.gandiva.Configuration, default None + Configuration for the projector. + + Returns + ------- + Projector instance + """ + cdef: + Expression child + c_vector[shared_ptr[CGandivaExpression]] c_children + shared_ptr[CProjector] result + + if configuration is None: + configuration = Configuration() + + for child in children: + if child is None: + raise TypeError("Expressions must not be None") + c_children.push_back(child.expression) + + check_status( + Projector_Make(schema.sp_schema, c_children, + _ensure_selection_mode(selection_mode), + configuration.configuration, + &result)) + return Projector.create(result, pool) + + +cpdef make_filter(Schema schema, Condition condition, + Configuration configuration=None): + """ + Construct a filter based on a condition. + + A filter is built for a specific schema and condition. Once the filter is + built, it can be used to evaluate many row batches. + + Parameters + ---------- + schema : pyarrow.Schema + Schema for the record batches, and the condition. + condition : pyarrow.gandiva.Condition + Filter condition. + configuration : pyarrow.gandiva.Configuration, default None + Configuration for the filter. + + Returns + ------- + Filter instance + """ + cdef shared_ptr[CFilter] result + if condition is None: + raise TypeError("Condition must not be None") + + if configuration is None: + configuration = Configuration() + + check_status( + Filter_Make(schema.sp_schema, condition.condition, configuration.configuration, &result)) + return Filter.create(result) + + +cdef class FunctionSignature(_Weakrefable): + """ + Signature of a Gandiva function including name, parameter types + and return type. + """ + + cdef: + shared_ptr[CFunctionSignature] signature + + def __init__(self): + raise TypeError("Do not call {}'s constructor directly." + .format(self.__class__.__name__)) + + @staticmethod + cdef create(shared_ptr[CFunctionSignature] signature): + cdef FunctionSignature self = FunctionSignature.__new__( + FunctionSignature) + self.signature = signature + return self + + def return_type(self): + return pyarrow_wrap_data_type(self.signature.get().ret_type()) + + def param_types(self): + result = [] + cdef vector[shared_ptr[CDataType]] types = \ + self.signature.get().param_types() + for t in types: + result.append(pyarrow_wrap_data_type(t)) + return result + + def name(self): + return self.signature.get().base_name().decode() + + def __repr__(self): + signature = self.signature.get().ToString().decode() + return "FunctionSignature(" + signature + ")" + + +def get_registered_function_signatures(): + """ + Return the function in Gandiva's ExpressionRegistry. 
+ + Returns + ------- + registry: a list of registered function signatures + """ + results = [] + + cdef vector[shared_ptr[CFunctionSignature]] signatures = \ + GetRegisteredFunctionSignatures() + + for signature in signatures: + results.append(FunctionSignature.create(signature)) + + return results diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/accumulation_queue.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/accumulation_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..a27b8b399ce475f614d6314e527847f8541ec155 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/accumulation_queue.h @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/compute/exec.h" +#include "arrow/result.h" + +namespace arrow { +namespace acero { +namespace util { + +using arrow::compute::ExecBatch; + +/// \brief A container that accumulates batches until they are ready to +/// be processed. +class AccumulationQueue { + public: + AccumulationQueue() : row_count_(0) {} + ~AccumulationQueue() = default; + + // We should never be copying ExecBatch around + AccumulationQueue(const AccumulationQueue&) = delete; + AccumulationQueue& operator=(const AccumulationQueue&) = delete; + + AccumulationQueue(AccumulationQueue&& that); + AccumulationQueue& operator=(AccumulationQueue&& that); + + void Concatenate(AccumulationQueue&& that); + void InsertBatch(ExecBatch batch); + int64_t row_count() { return row_count_; } + size_t batch_count() { return batches_.size(); } + bool empty() const { return batches_.empty(); } + void Clear(); + ExecBatch& operator[](size_t i); + + private: + int64_t row_count_; + std::vector batches_; +}; + +/// A queue that sequences incoming batches +/// +/// This can be used when a node needs to do some kind of ordered processing on +/// the stream. +/// +/// Batches can be inserted in any order. The process_callback will be called on +/// the batches, in order, without reentrant calls. For this reason the callback +/// should be quick. +/// +/// For example, in a top-n node, the process callback should determine how many +/// rows need to be delivered for the given batch, and then return a task to actually +/// deliver those rows. +class SequencingQueue { + public: + using Task = std::function; + + /// Strategy that describes how to handle items + class Processor { + public: + /// Process the batch, potentially generating a task + /// + /// This method will be called on each batch in order. Calls to this method + /// will be serialized and it will not be called reentrantly. 
This makes it + /// safe to do things that rely on order but minimal time should be spent here + /// to avoid becoming a bottleneck. + /// + /// \return a follow-up task that will be scheduled. The follow-up task(s) are + /// is not guaranteed to run in any particular order. If nullopt is + /// returned then nothing will be scheduled. + virtual Result> Process(ExecBatch batch) = 0; + /// Schedule a task + virtual void Schedule(Task task) = 0; + }; + + virtual ~SequencingQueue() = default; + + /// Insert a batch into the queue + /// + /// This will insert the batch into the queue. If this batch was the next batch + /// to deliver then this will trigger 1+ calls to the process callback to generate + /// 1+ tasks. + /// + /// The task generated by this call will be executed immediately. The remaining + /// tasks will be scheduled using the schedule callback. + /// + /// From a data pipeline perspective the sequencing queue is a "sometimes" breaker. If + /// a task arrives in order then this call will usually execute the downstream pipeline. + /// If this task arrives early then this call will only queue the data. + virtual Status InsertBatch(ExecBatch batch) = 0; + + /// Create a queue + /// \param processor describes how to process the batches, must outlive the queue + static std::unique_ptr Make(Processor* processor); +}; + +/// A queue that sequences incoming batches +/// +/// Unlike SequencingQueue the Process method is not expected to schedule new tasks. +/// +/// If a batch arrives and another thread is currently processing then the batch +/// will be queued and control will return. In other words, delivery of batches will +/// not block on the Process method. +/// +/// It can be helpful to think of this as if a dedicated thread is running Process as +/// batches arrive +class SerialSequencingQueue { + public: + /// Strategy that describes how to handle items + class Processor { + public: + /// Process the batch + /// + /// This method will be called on each batch in order. Calls to this method + /// will be serialized and it will not be called reentrantly. This makes it + /// safe to do things that rely on order. + /// + /// If this falls behind then data may accumulate + /// + /// TODO: Could add backpressure if needed but right now all uses of this should + /// be pretty fast and so are unlikely to block. + virtual Status Process(ExecBatch batch) = 0; + }; + + virtual ~SerialSequencingQueue() = default; + + /// Insert a batch into the queue + /// + /// This will insert the batch into the queue. If this batch was the next batch + /// to deliver then this may trigger calls to the processor which will be run + /// as part of this call. + virtual Status InsertBatch(ExecBatch batch) = 0; + + /// Create a queue + /// \param processor describes how to process the batches, must outlive the queue + static std::unique_ptr Make(Processor* processor); +}; + +} // namespace util +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/aggregate_node.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/aggregate_node.h new file mode 100644 index 0000000000000000000000000000000000000000..790264b2083052c4623e52718f569a65451475d9 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/aggregate_node.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include + +#include "arrow/acero/visibility.h" +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" + +namespace arrow { +namespace acero { +namespace aggregate { + +using compute::Aggregate; +using compute::default_exec_context; +using compute::ExecContext; + +/// \brief Make the output schema of an aggregate node +/// +/// The output schema is determined by the aggregation kernels, which may depend on the +/// ExecContext argument. To guarantee correct results, the same ExecContext argument +/// should be used in execution. +/// +/// \param[in] input_schema the schema of the input to the node +/// \param[in] keys the grouping keys for the aggregation +/// \param[in] segment_keys the segmenting keys for the aggregation +/// \param[in] aggregates the aggregates for the aggregation +/// \param[in] exec_ctx the execution context for the aggregation +ARROW_ACERO_EXPORT Result> MakeOutputSchema( + const std::shared_ptr& input_schema, const std::vector& keys, + const std::vector& segment_keys, const std::vector& aggregates, + ExecContext* exec_ctx = default_exec_context()); + +} // namespace aggregate +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/api.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/api.h new file mode 100644 index 0000000000000000000000000000000000000000..c9724fd512d0b56dfa3a24647b3885677c92b534 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/api.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
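MakeOutputSchema above is the C++ hook for the aggregate node; from Python the same node is normally driven through the pyarrow.acero bindings, which are not part of this excerpt. Treat the following as a sketch of typical usage under that assumption, with illustrative column names and an illustrative hash_sum aggregation:

import pyarrow as pa
from pyarrow import acero

table = pa.table({"key": ["a", "a", "b"], "x": [1, 2, 3]})

# Build a two-node plan: scan the in-memory table, then hash-aggregate by "key".
decl = acero.Declaration.from_sequence([
    acero.Declaration("table_source", acero.TableSourceNodeOptions(table)),
    acero.Declaration("aggregate", acero.AggregateNodeOptions(
        [("x", "hash_sum", None, "x_sum")], keys=["key"])),
])
result = decl.to_table()  # columns: the aggregated sum plus the grouping key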
+ +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +/// \defgroup acero-api Utilities for creating and executing execution plans +/// @{ +/// @} + +/// \defgroup acero-nodes Options classes for the various exec nodes +/// @{ +/// @} + +#include "arrow/acero/exec_plan.h" +#include "arrow/acero/options.h" diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/asof_join_node.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/asof_join_node.h new file mode 100644 index 0000000000000000000000000000000000000000..6a0ce8fd386b01ac868bac3d4d026a309e351cb3 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/asof_join_node.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/acero/options.h" +#include "arrow/acero/visibility.h" +#include "arrow/compute/exec.h" +#include "arrow/type.h" + +namespace arrow { +namespace acero { +namespace asofjoin { + +using AsofJoinKeys = AsofJoinNodeOptions::Keys; + +/// \brief Make the output schema of an as-of-join node +/// +/// \param[in] input_schema the schema of each input to the node +/// \param[in] input_keys the key of each input to the node +ARROW_ACERO_EXPORT Result> MakeOutputSchema( + const std::vector>& input_schema, + const std::vector& input_keys); + +} // namespace asofjoin +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/backpressure_handler.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/backpressure_handler.h new file mode 100644 index 0000000000000000000000000000000000000000..db6c3799354af4badcee8c88a77b0d67799bb906 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/backpressure_handler.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
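Looping back to the Gandiva bindings added earlier in this diff (gandiva.pyx): the TreeExprBuilder, make_expression and make_projector pieces documented there compose into a small end-to-end projection. This is a minimal sketch; the column names and the "add" function are illustrative assumptions:

import pyarrow as pa
import pyarrow.gandiva as gandiva

table = pa.table({"a": [1, 2, 3], "b": [10, 20, 30]})

builder = gandiva.TreeExprBuilder()
node_a = builder.make_field(table.schema.field("a"))
node_b = builder.make_field(table.schema.field("b"))
sum_node = builder.make_function("add", [node_a, node_b], pa.int64())
expr = builder.make_expression(sum_node, pa.field("a_plus_b", pa.int64()))

projector = gandiva.make_projector(table.schema, [expr], pa.default_memory_pool())
for batch in table.to_batches():
    (a_plus_b,) = projector.evaluate(batch)  # one output array per expression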
+ +#pragma once +#include "arrow/acero/exec_plan.h" +#include "arrow/acero/options.h" + +#include + +namespace arrow::acero { + +class BackpressureHandler { + private: + BackpressureHandler(ExecNode* input, size_t low_threshold, size_t high_threshold, + std::unique_ptr backpressure_control) + : input_(input), + low_threshold_(low_threshold), + high_threshold_(high_threshold), + backpressure_control_(std::move(backpressure_control)) {} + + public: + static Result Make( + ExecNode* input, size_t low_threshold, size_t high_threshold, + std::unique_ptr backpressure_control) { + if (low_threshold >= high_threshold) { + return Status::Invalid("low threshold (", low_threshold, + ") must be less than high threshold (", high_threshold, ")"); + } + if (backpressure_control == NULLPTR) { + return Status::Invalid("null backpressure control parameter"); + } + BackpressureHandler backpressure_handler(input, low_threshold, high_threshold, + std::move(backpressure_control)); + return backpressure_handler; + } + + void Handle(size_t start_level, size_t end_level) { + if (start_level < high_threshold_ && end_level >= high_threshold_) { + backpressure_control_->Pause(); + } else if (start_level > low_threshold_ && end_level <= low_threshold_) { + backpressure_control_->Resume(); + } + } + + Status ForceShutdown() { + // It may be unintuitive to call Resume() here, but this is to avoid a deadlock. + // Since acero's executor won't terminate if any one node is paused, we need to + // force resume the node before stopping production. + backpressure_control_->Resume(); + return input_->StopProducing(); + } + + private: + ExecNode* input_; + size_t low_threshold_; + size_t high_threshold_; + std::unique_ptr backpressure_control_; +}; + +} // namespace arrow::acero diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/benchmark_util.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/benchmark_util.h new file mode 100644 index 0000000000000000000000000000000000000000..0ba8553887c03f876b6e08f031f5641170c2e09f --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/benchmark_util.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
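BackpressureHandler::Handle above is plain hysteresis: production pauses when the queue level crosses high_threshold on the way up and resumes only after it falls back to low_threshold. A tiny Python illustration of that logic with concrete numbers (it mirrors the conditionals above; it is not a pyarrow API):

class BackpressureSketch:
    def __init__(self, low, high, pause, resume):
        self.low, self.high = low, high
        self.pause, self.resume = pause, resume

    def handle(self, start_level, end_level):
        # Crossing the high watermark upward pauses the producer...
        if start_level < self.high and end_level >= self.high:
            self.pause()
        # ...and only dropping back to the low watermark resumes it.
        elif start_level > self.low and end_level <= self.low:
            self.resume()

ctrl = BackpressureSketch(low=4, high=16,
                          pause=lambda: print("pause"),
                          resume=lambda: print("resume"))
ctrl.handle(15, 16)  # prints "pause"
ctrl.handle(16, 10)  # no-op: still above the low threshold
ctrl.handle(10, 4)   # prints "resume"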
+ +#pragma once + +#include +#include +#include + +#include "benchmark/benchmark.h" + +#include "arrow/acero/exec_plan.h" +#include "arrow/acero/test_util_internal.h" +#include "arrow/compute/exec.h" + +namespace arrow { + +namespace acero { + +Status BenchmarkNodeOverhead(benchmark::State& state, int32_t num_batches, + int32_t batch_size, arrow::acero::BatchesWithSchema data, + std::vector& node_declarations, + arrow::MemoryPool* pool = default_memory_pool()); + +Status BenchmarkIsolatedNodeOverhead(benchmark::State& state, + arrow::compute::Expression expr, int32_t num_batches, + int32_t batch_size, + arrow::acero::BatchesWithSchema data, + std::string factory_name, + arrow::acero::ExecNodeOptions& options, + arrow::MemoryPool* pool = default_memory_pool()); + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/bloom_filter.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/bloom_filter.h new file mode 100644 index 0000000000000000000000000000000000000000..8f9fe171baeb39f5347d112921666ba057cb56b6 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/bloom_filter.h @@ -0,0 +1,323 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/acero/partition_util.h" +#include "arrow/acero/util.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/simd.h" + +namespace arrow { +namespace acero { + +// A set of pre-generated bit masks from a 64-bit word. +// +// It is used to map selected bits of hash to a bit mask that will be used in +// a Bloom filter. +// +// These bit masks need to look random and need to have a similar fractions of +// bits set in order for a Bloom filter to have a low false positives rate. +// +struct ARROW_ACERO_EXPORT BloomFilterMasks { + // Generate all masks as a single bit vector. Each bit offset in this bit + // vector corresponds to a single mask. + // In each consecutive kBitsPerMask bits, there must be between + // kMinBitsSet and kMaxBitsSet bits set. + // + BloomFilterMasks(); + + inline uint64_t mask(int bit_offset) { +#if ARROW_LITTLE_ENDIAN + return (arrow::util::SafeLoadAs(masks_ + bit_offset / 8) >> + (bit_offset % 8)) & + kFullMask; +#else + return (BYTESWAP(arrow::util::SafeLoadAs(masks_ + bit_offset / 8)) >> + (bit_offset % 8)) & + kFullMask; +#endif + } + + // Masks are 57 bits long because then they can be accessed at an + // arbitrary bit offset using a single unaligned 64-bit load instruction. 
+ // + static constexpr int kBitsPerMask = 57; + static constexpr uint64_t kFullMask = (1ULL << kBitsPerMask) - 1; + + // Minimum and maximum number of bits set in each mask. + // This constraint is enforced when generating the bit masks. + // Values should be close to each other and chosen as to minimize a Bloom + // filter false positives rate. + // + static constexpr int kMinBitsSet = 4; + static constexpr int kMaxBitsSet = 5; + + // Number of generated masks. + // Having more masks to choose will improve false positives rate of Bloom + // filter but will also use more memory, which may lead to more CPU cache + // misses. + // The chosen value results in using only a few cache-lines for mask lookups, + // while providing a good variety of available bit masks. + // + static constexpr int kLogNumMasks = 10; + static constexpr int kNumMasks = 1 << kLogNumMasks; + + // Data of masks. Masks are stored in a single bit vector. Nth mask is + // kBitsPerMask bits starting at bit offset N. + // + static constexpr int kTotalBytes = (kNumMasks + 64) / 8; + uint8_t masks_[kTotalBytes]; +}; + +// A variant of a blocked Bloom filter implementation. +// A Bloom filter is a data structure that provides approximate membership test +// functionality based only on the hash of the key. Membership test may return +// false positives but not false negatives. Approximation of the result allows +// in general case (for arbitrary data types of keys) to save on both memory and +// lookup cost compared to the accurate membership test. +// The accurate test may sometimes still be cheaper for a specific data types +// and inputs, e.g. integers from a small range. +// +// This blocked Bloom filter is optimized for use in hash joins, to achieve a +// good balance between the size of the filter, the cost of its building and +// querying and the rate of false positives. +// +class ARROW_ACERO_EXPORT BlockedBloomFilter { + friend class BloomFilterBuilder_SingleThreaded; + friend class BloomFilterBuilder_Parallel; + + public: + BlockedBloomFilter() : log_num_blocks_(0), num_blocks_(0), blocks_(NULLPTR) {} + + inline bool Find(uint64_t hash) const { + uint64_t m = mask(hash); + uint64_t b = blocks_[block_id(hash)]; + return (b & m) == m; + } + + // Uses SIMD if available for smaller Bloom filters. + // Uses memory prefetching for larger Bloom filters. + // + void Find(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes, + uint8_t* result_bit_vector, bool enable_prefetch = true) const; + void Find(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes, + uint8_t* result_bit_vector, bool enable_prefetch = true) const; + + int log_num_blocks() const { return log_num_blocks_; } + + int NumHashBitsUsed() const; + + bool IsSameAs(const BlockedBloomFilter* other) const; + + int64_t NumBitsSet() const; + + // Folding of a block Bloom filter after the initial version + // has been built. + // + // One of the parameters for creation of Bloom filter is the number + // of bits allocated for it. The more bits allocated, the lower the + // probability of false positives. A good heuristic is to aim for + // half of the bits set in the constructed Bloom filter. This should + // result in a good trade off between size (and following cost of + // memory accesses) and false positives rate. + // + // There might have been many duplicate keys in the input provided + // to Bloom filter builder. In that case the resulting bit vector + // would be more sparse then originally intended. 
It is possible to + // easily correct that and cut in half the size of Bloom filter + // after it has already been constructed. The process to do that is + // approximately equal to OR-ing bits from upper and lower half (the + // way we address these bits when inserting or querying a hash makes + // such folding in half possible). + // + // We will keep folding as long as the fraction of bits set is less + // than 1/4. The resulting bit vector density should be in the [1/4, + // 1/2) range. + // + void Fold(); + + private: + Status CreateEmpty(int64_t num_rows_to_insert, MemoryPool* pool); + + inline void Insert(uint64_t hash) { + uint64_t m = mask(hash); + uint64_t& b = blocks_[block_id(hash)]; + b |= m; + } + + void Insert(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes); + void Insert(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes); + + inline uint64_t mask(uint64_t hash) const { + // The lowest bits of hash are used to pick mask index. + // + int mask_id = static_cast(hash & (BloomFilterMasks::kNumMasks - 1)); + uint64_t result = masks_.mask(mask_id); + + // The next set of hash bits is used to pick the amount of bit + // rotation of the mask. + // + int rotation = (hash >> BloomFilterMasks::kLogNumMasks) & 63; + result = ROTL64(result, rotation); + + return result; + } + + inline int64_t block_id(uint64_t hash) const { + // The next set of hash bits following the bits used to select a + // mask is used to pick block id (index of 64-bit word in a bit + // vector). + // + return (hash >> (BloomFilterMasks::kLogNumMasks + 6)) & (num_blocks_ - 1); + } + + template + inline void InsertImp(int64_t num_rows, const T* hashes); + + template + inline void FindImp(int64_t num_rows, const T* hashes, uint8_t* result_bit_vector, + bool enable_prefetch) const; + + void SingleFold(int num_folds); + +#if defined(ARROW_HAVE_RUNTIME_AVX2) + inline __m256i mask_avx2(__m256i hash) const; + inline __m256i block_id_avx2(__m256i hash) const; + int64_t Insert_avx2(int64_t num_rows, const uint32_t* hashes); + int64_t Insert_avx2(int64_t num_rows, const uint64_t* hashes); + template + int64_t InsertImp_avx2(int64_t num_rows, const T* hashes); + int64_t Find_avx2(int64_t num_rows, const uint32_t* hashes, + uint8_t* result_bit_vector) const; + int64_t Find_avx2(int64_t num_rows, const uint64_t* hashes, + uint8_t* result_bit_vector) const; + template + int64_t FindImp_avx2(int64_t num_rows, const T* hashes, + uint8_t* result_bit_vector) const; +#endif + + bool UsePrefetch() const { + return num_blocks_ * sizeof(uint64_t) > kPrefetchLimitBytes; + } + + static constexpr int64_t kPrefetchLimitBytes = 256 * 1024; + + static BloomFilterMasks masks_; + + // Total number of bits used by block Bloom filter must be a power + // of 2. + // + int log_num_blocks_; + int64_t num_blocks_; + + // Buffer allocated to store an array of power of 2 64-bit blocks. + // + std::shared_ptr buf_; + // Pointer to mutable data owned by Buffer + // + uint64_t* blocks_; +}; + +// We have two separate implementations of building a Bloom filter, multi-threaded and +// single-threaded. +// +// Single threaded version is useful in two ways: +// a) It allows to verify parallel implementation in tests (the single threaded one is +// simpler and can be used as the source of truth). +// b) It is preferred for small and medium size Bloom filters, because it skips extra +// synchronization related steps from parallel variant (partitioning and taking locks). 
+// +enum class BloomFilterBuildStrategy { + SINGLE_THREADED = 0, + PARALLEL = 1, +}; + +class ARROW_ACERO_EXPORT BloomFilterBuilder { + public: + virtual ~BloomFilterBuilder() = default; + virtual Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool, + int64_t num_rows, int64_t num_batches, + BlockedBloomFilter* build_target) = 0; + virtual int64_t num_tasks() const { return 0; } + virtual Status PushNextBatch(size_t thread_index, int64_t num_rows, + const uint32_t* hashes) = 0; + virtual Status PushNextBatch(size_t thread_index, int64_t num_rows, + const uint64_t* hashes) = 0; + virtual void CleanUp() {} + static std::unique_ptr Make(BloomFilterBuildStrategy strategy); +}; + +class ARROW_ACERO_EXPORT BloomFilterBuilder_SingleThreaded : public BloomFilterBuilder { + public: + Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool, + int64_t num_rows, int64_t num_batches, + BlockedBloomFilter* build_target) override; + + Status PushNextBatch(size_t /*thread_index*/, int64_t num_rows, + const uint32_t* hashes) override; + + Status PushNextBatch(size_t /*thread_index*/, int64_t num_rows, + const uint64_t* hashes) override; + + private: + template + void PushNextBatchImp(int64_t num_rows, const T* hashes); + + int64_t hardware_flags_; + BlockedBloomFilter* build_target_; +}; + +class ARROW_ACERO_EXPORT BloomFilterBuilder_Parallel : public BloomFilterBuilder { + public: + Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool, + int64_t num_rows, int64_t num_batches, + BlockedBloomFilter* build_target) override; + + Status PushNextBatch(size_t thread_id, int64_t num_rows, + const uint32_t* hashes) override; + + Status PushNextBatch(size_t thread_id, int64_t num_rows, + const uint64_t* hashes) override; + + void CleanUp() override; + + private: + template + void PushNextBatchImp(size_t thread_id, int64_t num_rows, const T* hashes); + + int64_t hardware_flags_; + BlockedBloomFilter* build_target_; + int log_num_prtns_; + struct ThreadLocalState { + std::vector partitioned_hashes_32; + std::vector partitioned_hashes_64; + std::vector partition_ranges; + std::vector unprocessed_partition_ids; + }; + std::vector thread_local_states_; + PartitionLocks prtn_locks_; +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/exec_plan.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/exec_plan.h new file mode 100644 index 0000000000000000000000000000000000000000..dba6c64ddc8379f7a8e6aa666f55555ced6c78aa --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/exec_plan.h @@ -0,0 +1,819 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
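+
+// A single-threaded build-and-probe sketch for the BlockedBloomFilter /
+// BloomFilterBuilder pair declared in bloom_filter.h above; the literal hash
+// values are stand-ins for hashes produced by the join's key hashing:
+//
+//   BlockedBloomFilter filter;
+//   auto builder = BloomFilterBuilder::Make(BloomFilterBuildStrategy::SINGLE_THREADED);
+//   std::vector<uint64_t> hashes = {0x1234, 0x5678, 0x9abc};
+//   ARROW_RETURN_NOT_OK(builder->Begin(/*num_threads=*/1, /*hardware_flags=*/0,
+//                                      default_memory_pool(), /*num_rows=*/3,
+//                                      /*num_batches=*/1, &filter));
+//   ARROW_RETURN_NOT_OK(builder->PushNextBatch(/*thread_index=*/0, /*num_rows=*/3,
+//                                              hashes.data()));
+//   builder->CleanUp();
+//   bool hit = filter.Find(hashes[0]);    // always true: no false negatives
+//   bool miss = filter.Find(0xdeadbeef);  // usually false; may be a false positive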
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/acero/type_fwd.h" +#include "arrow/acero/visibility.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/ordering.h" +#include "arrow/type_fwd.h" +#include "arrow/util/future.h" +#include "arrow/util/macros.h" +#include "arrow/util/tracing.h" +#include "arrow/util/type_fwd.h" + +namespace arrow { + +using compute::ExecBatch; +using compute::ExecContext; +using compute::FunctionRegistry; +using compute::GetFunctionRegistry; +using compute::Ordering; +using compute::threaded_exec_context; + +namespace acero { + +/// \addtogroup acero-internals +/// @{ + +class ARROW_ACERO_EXPORT ExecPlan : public std::enable_shared_from_this { + public: + // This allows operators to rely on signed 16-bit indices + static const uint32_t kMaxBatchSize = 1 << 15; + using NodeVector = std::vector; + + virtual ~ExecPlan() = default; + + QueryContext* query_context(); + + /// \brief retrieve the nodes in the plan + const NodeVector& nodes() const; + + /// Make an empty exec plan + static Result> Make( + QueryOptions options, ExecContext exec_context = *threaded_exec_context(), + std::shared_ptr metadata = NULLPTR); + + static Result> Make( + ExecContext exec_context = *threaded_exec_context(), + std::shared_ptr metadata = NULLPTR); + + static Result> Make( + QueryOptions options, ExecContext* exec_context, + std::shared_ptr metadata = NULLPTR); + + static Result> Make( + ExecContext* exec_context, + std::shared_ptr metadata = NULLPTR); + + ExecNode* AddNode(std::unique_ptr node); + + template + Node* EmplaceNode(Args&&... args) { + std::unique_ptr node{new Node{std::forward(args)...}}; + auto out = node.get(); + AddNode(std::move(node)); + return out; + } + + Status Validate(); + + /// \brief Start producing on all nodes + /// + /// Nodes are started in reverse topological order, such that any node + /// is started before all of its inputs. + void StartProducing(); + + /// \brief Stop producing on all nodes + /// + /// Triggers all sources to stop producing new data. In order to cleanly stop the plan + /// will continue to run any tasks that are already in progress. The caller should + /// still wait for `finished` to complete before destroying the plan. + void StopProducing(); + + /// \brief A future which will be marked finished when all tasks have finished. + Future<> finished(); + + /// \brief Return whether the plan has non-empty metadata + bool HasMetadata() const; + + /// \brief Return the plan's attached metadata + std::shared_ptr metadata() const; + + std::string ToString() const; +}; + +// Acero can be extended by providing custom implementations of ExecNode. The methods +// below are documented in detail and provide careful instruction on how to fulfill the +// ExecNode contract. It's suggested you familiarize yourself with the Acero +// documentation in the C++ user guide. +class ARROW_ACERO_EXPORT ExecNode { + public: + using NodeVector = std::vector; + + virtual ~ExecNode() = default; + + virtual const char* kind_name() const = 0; + + // The number of inputs expected by this node + int num_inputs() const { return static_cast(inputs_.size()); } + + /// This node's predecessors in the exec plan + const NodeVector& inputs() const { return inputs_; } + + /// True if the plan has no output schema (is a sink) + bool is_sink() const { return !output_schema_; } + + /// \brief Labels identifying the function of each input. 
+ const std::vector& input_labels() const { return input_labels_; } + + /// This node's successor in the exec plan + const ExecNode* output() const { return output_; } + + /// The datatypes for batches produced by this node + const std::shared_ptr& output_schema() const { return output_schema_; } + + /// This node's exec plan + ExecPlan* plan() { return plan_; } + + /// \brief An optional label, for display and debugging + /// + /// There is no guarantee that this value is non-empty or unique. + const std::string& label() const { return label_; } + void SetLabel(std::string label) { label_ = std::move(label); } + + virtual Status Validate() const; + + /// \brief the ordering of the output batches + /// + /// This does not guarantee the batches will be emitted by this node + /// in order. Instead it guarantees that the batches will have their + /// ExecBatch::index property set in a way that respects this ordering. + /// + /// In other words, given the ordering {{"x", SortOrder::Ascending}} we + /// know that all values of x in a batch with index N will be less than + /// or equal to all values of x in a batch with index N+k (assuming k > 0). + /// Furthermore, we also know that values will be sorted within a batch. + /// Any row N will have a value of x that is less than the value for + /// any row N+k. + /// + /// Note that an ordering can be both Ordering::Unordered and Ordering::Implicit. + /// A node's output should be marked Ordering::Unordered if the order is + /// non-deterministic. For example, a hash-join has no predictable output order. + /// + /// If the ordering is Ordering::Implicit then there is a meaningful order but that + /// ordering is not represented by any column in the data. The most common case for + /// this is when reading data from an in-memory table. The data has an implicit "row + /// order" which is not necessarily represented in the data set. + /// + /// A filter or project node will not modify the ordering. Nothing needs to be done + /// other than ensure the index assigned to output batches is the same as the + /// input batch that was mapped. + /// + /// Other nodes may introduce order. For example, an order-by node will emit + /// a brand new ordering independent of the input ordering. + /// + /// Finally, as described above, such as a hash-join or aggregation may may + /// destroy ordering (although these nodes could also choose to establish a + /// new ordering based on the hash keys). + /// + /// Some nodes will require an ordering. For example, a fetch node or an + /// asof join node will only function if the input data is ordered (for fetch + /// it is enough to be implicitly ordered. For an asof join the ordering must + /// be explicit and compatible with the on key.) + /// + /// Nodes that maintain ordering should be careful to avoid introducing gaps + /// in the batch index. This may require emitting empty batches in order to + /// maintain continuity. + virtual const Ordering& ordering() const; + + /// Upstream API: + /// These functions are called by input nodes that want to inform this node + /// about an updated condition (a new input batch or an impending + /// end of stream). 
+ /// + /// Implementation rules: + /// - these may be called anytime after StartProducing() has succeeded + /// (and even during or after StopProducing()) + /// - these may be called concurrently + /// - these are allowed to call back into PauseProducing(), ResumeProducing() + /// and StopProducing() + + /// Transfer input batch to ExecNode + /// + /// A node will typically perform some kind of operation on the batch + /// and then call InputReceived on its outputs with the result. + /// + /// Other nodes may need to accumulate some number of inputs before any + /// output can be produced. These nodes will add the batch to some kind + /// of in-memory accumulation queue and return. + virtual Status InputReceived(ExecNode* input, ExecBatch batch) = 0; + + /// Mark the inputs finished after the given number of batches. + /// + /// This may be called before all inputs are received. This simply fixes + /// the total number of incoming batches for an input, so that the ExecNode + /// knows when it has received all input, regardless of order. + virtual Status InputFinished(ExecNode* input, int total_batches) = 0; + + /// \brief Perform any needed initialization + /// + /// This hook performs any actions in between creation of ExecPlan and the call to + /// StartProducing. An example could be Bloom filter pushdown. The order of ExecNodes + /// that executes this method is undefined, but the calls are made synchronously. + /// + /// At this point a node can rely on all inputs & outputs (and the input schemas) + /// being well defined. + virtual Status Init(); + + /// Lifecycle API: + /// - start / stop to initiate and terminate production + /// - pause / resume to apply backpressure + /// + /// Implementation rules: + /// - StartProducing() should not recurse into the inputs, as it is + /// handled by ExecPlan::StartProducing() + /// - PauseProducing(), ResumeProducing(), StopProducing() may be called + /// concurrently, potentially even before the call to StartProducing + /// has finished. + /// - PauseProducing(), ResumeProducing(), StopProducing() may be called + /// by the downstream nodes' InputReceived(), InputFinished() methods + /// + /// StopProducing may be called due to an error, by the user (e.g. cancel), or + /// because a node has all the data it needs (e.g. limit, top-k on sorted data). + /// This means the method may be called multiple times and we have the following + /// additional rules + /// - StopProducing() must be idempotent + /// - StopProducing() must be forwarded to inputs (this is needed for the limit/top-k + /// case because we may not be stopping the entire plan) + + // Right now, since synchronous calls happen in both directions (input to + // output and then output to input), a node must be careful to be reentrant + // against synchronous calls from its output, *and* also concurrent calls from + // other threads. The most reliable solution is to update the internal state + // first, and notify outputs only at the end. + // + // Concurrent calls to PauseProducing and ResumeProducing can be hard to sequence + // as they may travel at different speeds through the plan. + // + // For example, consider a resume that comes quickly after a pause. If the source + // receives the resume before the pause the source may think the destination is full + // and halt production which would lead to deadlock. + // + // To resolve this a counter is sent for all calls to pause/resume. Only the call with + // the highest counter value is valid. 
So if a call to PauseProducing(5) comes after + // a call to ResumeProducing(6) then the source should continue producing. + + /// \brief Start producing + /// + /// This must only be called once. + /// + /// This is typically called automatically by ExecPlan::StartProducing(). + virtual Status StartProducing() = 0; + + /// \brief Pause producing temporarily + /// + /// \param output Pointer to the output that is full + /// \param counter Counter used to sequence calls to pause/resume + /// + /// This call is a hint that an output node is currently not willing + /// to receive data. + /// + /// This may be called any number of times. + /// However, the node is still free to produce data (which may be difficult + /// to prevent anyway if data is produced using multiple threads). + virtual void PauseProducing(ExecNode* output, int32_t counter) = 0; + + /// \brief Resume producing after a temporary pause + /// + /// \param output Pointer to the output that is now free + /// \param counter Counter used to sequence calls to pause/resume + /// + /// This call is a hint that an output node is willing to receive data again. + /// + /// This may be called any number of times. + virtual void ResumeProducing(ExecNode* output, int32_t counter) = 0; + + /// \brief Stop producing new data + /// + /// If this node is a source then the source should stop generating data + /// as quickly as possible. If this node is not a source then there is typically + /// nothing that needs to be done although a node may choose to start ignoring incoming + /// data. + /// + /// This method will be called when an error occurs in the plan + /// This method may also be called by the user if they wish to end a plan early + /// Finally, this method may be called if a node determines it no longer needs any more + /// input (for example, a limit node). + /// + /// This method may be called multiple times. + /// + /// This is not a pause. There will be no way to start the source again after this has + /// been called. + virtual Status StopProducing(); + + std::string ToString(int indent = 0) const; + + protected: + ExecNode(ExecPlan* plan, NodeVector inputs, std::vector input_labels, + std::shared_ptr output_schema); + + virtual Status StopProducingImpl() = 0; + + /// Provide extra info to include in the string representation. + virtual std::string ToStringExtra(int indent = 0) const; + + std::atomic stopped_; + ExecPlan* plan_; + std::string label_; + + NodeVector inputs_; + std::vector input_labels_; + + std::shared_ptr output_schema_; + ExecNode* output_ = NULLPTR; +}; + +/// \brief An extensible registry for factories of ExecNodes +class ARROW_ACERO_EXPORT ExecFactoryRegistry { + public: + using Factory = std::function(ExecPlan*, std::vector, + const ExecNodeOptions&)>; + + virtual ~ExecFactoryRegistry() = default; + + /// \brief Get the named factory from this registry + /// + /// will raise if factory_name is not found + virtual Result GetFactory(const std::string& factory_name) = 0; + + /// \brief Add a factory to this registry with the provided name + /// + /// will raise if factory_name is already in the registry + virtual Status AddFactory(std::string factory_name, Factory factory) = 0; +}; + +/// The default registry, which includes built-in factories. 
+ARROW_ACERO_EXPORT +ExecFactoryRegistry* default_exec_factory_registry(); + +/// \brief Construct an ExecNode using the named factory +inline Result MakeExecNode( + const std::string& factory_name, ExecPlan* plan, std::vector inputs, + const ExecNodeOptions& options, + ExecFactoryRegistry* registry = default_exec_factory_registry()) { + ARROW_ASSIGN_OR_RAISE(auto factory, registry->GetFactory(factory_name)); + return factory(plan, std::move(inputs), options); +} + +/// @} + +/// \addtogroup acero-api +/// @{ + +/// \brief Helper class for declaring execution nodes +/// +/// A Declaration represents an unconstructed ExecNode (and potentially an entire graph +/// since its inputs may also be Declarations) +/// +/// A Declaration can be converted to a plan and executed using one of the +/// DeclarationToXyz methods. +/// +/// For more direct control, a Declaration can be added to an existing execution +/// plan with Declaration::AddToPlan, which will recursively construct any inputs as +/// necessary. +struct ARROW_ACERO_EXPORT Declaration { + using Input = std::variant; + + Declaration() {} + + /// \brief construct a declaration + /// \param factory_name the name of the exec node to construct. The node must have + /// been added to the exec node registry with this name. + /// \param inputs the inputs to the node, these should be other declarations + /// \param options options that control the behavior of the node. You must use + /// the appropriate subclass. For example, if `factory_name` is + /// "project" then `options` should be ProjectNodeOptions. + /// \param label a label to give the node. Can be used to distinguish it from other + /// nodes of the same type in the plan. + Declaration(std::string factory_name, std::vector inputs, + std::shared_ptr options, std::string label) + : factory_name{std::move(factory_name)}, + inputs{std::move(inputs)}, + options{std::move(options)}, + label{std::move(label)} {} + + template + Declaration(std::string factory_name, std::vector inputs, Options options, + std::string label) + : Declaration{std::move(factory_name), std::move(inputs), + std::shared_ptr( + std::make_shared(std::move(options))), + std::move(label)} {} + + template + Declaration(std::string factory_name, std::vector inputs, Options options) + : Declaration{std::move(factory_name), std::move(inputs), std::move(options), + /*label=*/""} {} + + template + Declaration(std::string factory_name, Options options) + : Declaration{std::move(factory_name), {}, std::move(options), /*label=*/""} {} + + template + Declaration(std::string factory_name, Options options, std::string label) + : Declaration{std::move(factory_name), {}, std::move(options), std::move(label)} {} + + /// \brief Convenience factory for the common case of a simple sequence of nodes. + /// + /// Each of decls will be appended to the inputs of the subsequent declaration, + /// and the final modified declaration will be returned. 
+ /// + /// Without this convenience factory, constructing a sequence would require explicit, + /// difficult-to-read nesting: + /// + /// Declaration{"n3", + /// { + /// Declaration{"n2", + /// { + /// Declaration{"n1", + /// { + /// Declaration{"n0", N0Opts{}}, + /// }, + /// N1Opts{}}, + /// }, + /// N2Opts{}}, + /// }, + /// N3Opts{}}; + /// + /// An equivalent Declaration can be constructed more tersely using Sequence: + /// + /// Declaration::Sequence({ + /// {"n0", N0Opts{}}, + /// {"n1", N1Opts{}}, + /// {"n2", N2Opts{}}, + /// {"n3", N3Opts{}}, + /// }); + static Declaration Sequence(std::vector decls); + + /// \brief add the declaration to an already created execution plan + /// \param plan the plan to add the node to + /// \param registry the registry to use to lookup the node factory + /// + /// This method will recursively call AddToPlan on all of the declaration's inputs. + /// This method is only for advanced use when the DeclarationToXyz methods are not + /// sufficient. + /// + /// \return the instantiated execution node + Result AddToPlan(ExecPlan* plan, ExecFactoryRegistry* registry = + default_exec_factory_registry()) const; + + // Validate a declaration + bool IsValid(ExecFactoryRegistry* registry = default_exec_factory_registry()) const; + + /// \brief the name of the factory to use when creating a node + std::string factory_name; + /// \brief the declarations's inputs + std::vector inputs; + /// \brief options to control the behavior of the node + std::shared_ptr options; + /// \brief a label to give the node in the plan + std::string label; +}; + +/// \brief How to handle unaligned buffers +enum class UnalignedBufferHandling { kWarn, kIgnore, kReallocate, kError }; + +/// \brief get the default behavior of unaligned buffer handling +/// +/// This is configurable via the ACERO_ALIGNMENT_HANDLING environment variable which +/// can be set to "warn", "ignore", "reallocate", or "error". If the environment +/// variable is not set, or is set to an invalid value, this will return kWarn +UnalignedBufferHandling GetDefaultUnalignedBufferHandling(); + +/// \brief plan-wide options that can be specified when executing an execution plan +struct ARROW_ACERO_EXPORT QueryOptions { + /// \brief Should the plan use a legacy batching strategy + /// + /// This is currently in place only to support the Scanner::ToTable + /// method. This method relies on batch indices from the scanner + /// remaining consistent. This is impractical in the ExecPlan which + /// might slice batches as needed (e.g. for a join) + /// + /// However, it still works for simple plans and this is the only way + /// we have at the moment for maintaining implicit order. + bool use_legacy_batching = false; + + /// If the output has a meaningful order then sequence the output of the plan + /// + /// The default behavior (std::nullopt) will sequence output batches if there + /// is a meaningful ordering in the final node and will emit batches immediately + /// otherwise. + /// + /// If explicitly set to true then plan execution will fail if there is no + /// meaningful ordering. This can be useful to validate a query that should + /// be emitting ordered results. + /// + /// If explicitly set to false then batches will be emit immediately even if there + /// is a meaningful ordering. This could cause batches to be emit out of order but + /// may offer a small decrease to latency. 
+ std::optional sequence_output = std::nullopt; + + /// \brief should the plan use multiple background threads for CPU-intensive work + /// + /// If this is false then all CPU work will be done on the calling thread. I/O tasks + /// will still happen on the I/O executor and may be multi-threaded (but should not use + /// significant CPU resources). + /// + /// Will be ignored if custom_cpu_executor is set + bool use_threads = true; + + /// \brief custom executor to use for CPU-intensive work + /// + /// Must be null or remain valid for the duration of the plan. If this is null then + /// a default thread pool will be chosen whose behavior will be controlled by + /// the `use_threads` option. + ::arrow::internal::Executor* custom_cpu_executor = NULLPTR; + + /// \brief custom executor to use for IO work + /// + /// Must be null or remain valid for the duration of the plan. If this is null then + /// the global io thread pool will be chosen whose behavior will be controlled by + /// the "ARROW_IO_THREADS" environment. + ::arrow::internal::Executor* custom_io_executor = NULLPTR; + + /// \brief a memory pool to use for allocations + /// + /// Must remain valid for the duration of the plan. + MemoryPool* memory_pool = default_memory_pool(); + + /// \brief a function registry to use for the plan + /// + /// Must remain valid for the duration of the plan. + FunctionRegistry* function_registry = GetFunctionRegistry(); + /// \brief the names of the output columns + /// + /// If this is empty then names will be generated based on the input columns + /// + /// If set then the number of names must equal the number of output columns + std::vector field_names; + + /// \brief Policy for unaligned buffers in source data + /// + /// Various compute functions and acero internals will type pun array + /// buffers from uint8_t* to some kind of value type (e.g. we might + /// cast to int32_t* to add two int32 arrays) + /// + /// If the buffer is poorly aligned (e.g. an int32 array is not aligned + /// on a 4-byte boundary) then this is technically undefined behavior in C++. + /// However, most modern compilers and CPUs are fairly tolerant of this + /// behavior and nothing bad (beyond a small hit to performance) is likely + /// to happen. + /// + /// Note that this only applies to source buffers. All buffers allocated internally + /// by Acero will be suitably aligned. + /// + /// If this field is set to kWarn then Acero will check if any buffers are unaligned + /// and, if they are, will emit a warning. + /// + /// If this field is set to kReallocate then Acero will allocate a new, suitably aligned + /// buffer and copy the contents from the old buffer into this new buffer. + /// + /// If this field is set to kError then Acero will gracefully abort the plan instead. + /// + /// If this field is set to kIgnore then Acero will not even check if the buffers are + /// unaligned. + /// + /// If this field is not set then it will be treated as kWarn unless overridden + /// by the ACERO_ALIGNMENT_HANDLING environment variable + std::optional unaligned_buffer_handling; +}; + +/// \brief Calculate the output schema of a declaration +/// +/// This does not actually execute the plan. This operation may fail if the +/// declaration represents an invalid plan (e.g. a project node with multiple inputs) +/// +/// \param declaration A declaration describing an execution plan +/// \param function_registry The function registry to use for function execution. If null +/// then the default function registry will be used. 
+/// +/// \return the schema that batches would have after going through the execution plan +ARROW_ACERO_EXPORT Result> DeclarationToSchema( + const Declaration& declaration, FunctionRegistry* function_registry = NULLPTR); + +/// \brief Create a string representation of a plan +/// +/// This representation is for debug purposes only. +/// +/// Conversion to a string may fail if the declaration represents an +/// invalid plan. +/// +/// Use Substrait for complete serialization of plans +/// +/// \param declaration A declaration describing an execution plan +/// \param function_registry The function registry to use for function execution. If null +/// then the default function registry will be used. +/// +/// \return a string representation of the plan suitable for debugging output +ARROW_ACERO_EXPORT Result DeclarationToString( + const Declaration& declaration, FunctionRegistry* function_registry = NULLPTR); + +/// \brief Utility method to run a declaration and collect the results into a table +/// +/// \param declaration A declaration describing the plan to run +/// \param use_threads If `use_threads` is false then all CPU work will be done on the +/// calling thread. I/O tasks will still happen on the I/O executor +/// and may be multi-threaded (but should not use significant CPU +/// resources). +/// \param memory_pool The memory pool to use for allocations made while running the plan. +/// \param function_registry The function registry to use for function execution. If null +/// then the default function registry will be used. +/// +/// This method will add a sink node to the declaration to collect results into a +/// table. It will then create an ExecPlan from the declaration, start the exec plan, +/// block until the plan has finished, and return the created table. +ARROW_ACERO_EXPORT Result> DeclarationToTable( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +ARROW_ACERO_EXPORT Result> DeclarationToTable( + Declaration declaration, QueryOptions query_options); + +/// \brief Asynchronous version of \see DeclarationToTable +/// +/// \param declaration A declaration describing the plan to run +/// \param use_threads The behavior of use_threads is slightly different than the +/// synchronous version since we cannot run synchronously on the +/// calling thread. Instead, if use_threads=false then a new thread +/// pool will be created with a single thread and this will be used for +/// all compute work. +/// \param memory_pool The memory pool to use for allocations made while running the plan. +/// \param function_registry The function registry to use for function execution. If null +/// then the default function registry will be used. +ARROW_ACERO_EXPORT Future> DeclarationToTableAsync( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToTableAsync accepting a custom exec context +/// +/// The executor must be specified (cannot be null) and must be kept alive until the +/// returned future finishes. 
+ARROW_ACERO_EXPORT Future> DeclarationToTableAsync( + Declaration declaration, ExecContext custom_exec_context); + +/// \brief a collection of exec batches with a common schema +struct BatchesWithCommonSchema { + std::vector batches; + std::shared_ptr schema; +}; + +/// \brief Utility method to run a declaration and collect the results into ExecBatch +/// vector +/// +/// \see DeclarationToTable for details on threading & execution +ARROW_ACERO_EXPORT Result DeclarationToExecBatches( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +ARROW_ACERO_EXPORT Result DeclarationToExecBatches( + Declaration declaration, QueryOptions query_options); + +/// \brief Asynchronous version of \see DeclarationToExecBatches +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_ACERO_EXPORT Future DeclarationToExecBatchesAsync( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToExecBatchesAsync accepting a custom exec context +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_ACERO_EXPORT Future DeclarationToExecBatchesAsync( + Declaration declaration, ExecContext custom_exec_context); + +/// \brief Utility method to run a declaration and collect the results into a vector +/// +/// \see DeclarationToTable for details on threading & execution +ARROW_ACERO_EXPORT Result>> DeclarationToBatches( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +ARROW_ACERO_EXPORT Result>> DeclarationToBatches( + Declaration declaration, QueryOptions query_options); + +/// \brief Asynchronous version of \see DeclarationToBatches +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_ACERO_EXPORT Future>> +DeclarationToBatchesAsync(Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToBatchesAsync accepting a custom exec context +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_ACERO_EXPORT Future>> +DeclarationToBatchesAsync(Declaration declaration, ExecContext exec_context); + +/// \brief Utility method to run a declaration and return results as a RecordBatchReader +/// +/// If an exec context is not provided then a default exec context will be used based +/// on the value of `use_threads`. If `use_threads` is false then the CPU executor will +/// be a serial executor and all CPU work will be done on the calling thread. I/O tasks +/// will still happen on the I/O executor and may be multi-threaded. +/// +/// If `use_threads` is false then all CPU work will happen during the calls to +/// RecordBatchReader::Next and no CPU work will happen in the background. If +/// `use_threads` is true then CPU work will happen on the CPU thread pool and tasks may +/// run in between calls to RecordBatchReader::Next. If the returned reader is not +/// consumed quickly enough then the plan will eventually pause as the backpressure queue +/// fills up. +/// +/// If a custom exec context is provided then the value of `use_threads` will be ignored. 
+/// +/// The returned RecordBatchReader can be closed early to cancel the computation of record +/// batches. In this case, only errors encountered by the computation may be reported. In +/// particular, no cancellation error may be reported. +ARROW_ACERO_EXPORT Result> DeclarationToReader( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +ARROW_ACERO_EXPORT Result> DeclarationToReader( + Declaration declaration, QueryOptions query_options); + +/// \brief Utility method to run a declaration and ignore results +/// +/// This can be useful when the data are consumed as part of the plan itself, for +/// example, when the plan ends with a write node. +/// +/// \see DeclarationToTable for details on threading & execution +ARROW_ACERO_EXPORT Status +DeclarationToStatus(Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +ARROW_ACERO_EXPORT Status DeclarationToStatus(Declaration declaration, + QueryOptions query_options); + +/// \brief Asynchronous version of \see DeclarationToStatus +/// +/// This can be useful when the data are consumed as part of the plan itself, for +/// example, when the plan ends with a write node. +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_ACERO_EXPORT Future<> DeclarationToStatusAsync( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToStatusAsync accepting a custom exec context +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_ACERO_EXPORT Future<> DeclarationToStatusAsync(Declaration declaration, + ExecContext exec_context); + +/// @} + +/// \brief Wrap an ExecBatch generator in a RecordBatchReader. +/// +/// The RecordBatchReader does not impose any ordering on emitted batches. +ARROW_ACERO_EXPORT +std::shared_ptr MakeGeneratorReader( + std::shared_ptr, std::function>()>, + MemoryPool*); + +constexpr int kDefaultBackgroundMaxQ = 32; +constexpr int kDefaultBackgroundQRestart = 16; + +/// \brief Make a generator of RecordBatchReaders +/// +/// Useful as a source node for an Exec plan +ARROW_ACERO_EXPORT +Result>()>> MakeReaderGenerator( + std::shared_ptr reader, arrow::internal::Executor* io_executor, + int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart); + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join.h new file mode 100644 index 0000000000000000000000000000000000000000..a81ff274e5e3a46bab8fe7a12902a1c6c62c0bbd --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/acero/accumulation_queue.h" +#include "arrow/acero/bloom_filter.h" +#include "arrow/acero/options.h" +#include "arrow/acero/query_context.h" +#include "arrow/acero/schema_util.h" +#include "arrow/acero/task_util.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/tracing.h" + +namespace arrow { +namespace acero { + +using util::AccumulationQueue; + +class HashJoinImpl { + public: + using OutputBatchCallback = std::function; + using BuildFinishedCallback = std::function; + using FinishedCallback = std::function; + using RegisterTaskGroupCallback = std::function, std::function)>; + using StartTaskGroupCallback = std::function; + using AbortContinuationImpl = std::function; + + virtual ~HashJoinImpl() = default; + virtual Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, + const HashJoinProjectionMaps* proj_map_left, + const HashJoinProjectionMaps* proj_map_right, + std::vector key_cmp, Expression filter, + RegisterTaskGroupCallback register_task_group_callback, + StartTaskGroupCallback start_task_group_callback, + OutputBatchCallback output_batch_callback, + FinishedCallback finished_callback) = 0; + + virtual Status BuildHashTable(size_t thread_index, AccumulationQueue batches, + BuildFinishedCallback on_finished) = 0; + virtual Status ProbeSingleBatch(size_t thread_index, ExecBatch batch) = 0; + virtual Status ProbingFinished(size_t thread_index) = 0; + virtual void Abort(TaskScheduler::AbortContinuationImpl pos_abort_callback) = 0; + virtual std::string ToString() const = 0; + + static Result> MakeBasic(); + static Result> MakeSwiss(); + + protected: + arrow::util::tracing::Span span_; +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join_dict.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join_dict.h new file mode 100644 index 0000000000000000000000000000000000000000..02454a7146278176e27379e6033f79547574a367 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join_dict.h @@ -0,0 +1,318 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
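+
+// A sketch of driving a hash join through the Declaration helpers from
+// exec_plan.h above. TableSourceNodeOptions and HashJoinNodeOptions are assumed
+// to come from arrow/acero/options.h, and "hashjoin" / "table_source" are the
+// factory names in the default exec factory registry:
+//
+//   Declaration left{"table_source", TableSourceNodeOptions{left_table}};
+//   Declaration right{"table_source", TableSourceNodeOptions{right_table}};
+//   Declaration join{"hashjoin", {std::move(left), std::move(right)},
+//                    HashJoinNodeOptions{JoinType::INNER,
+//                                        /*left_keys=*/{"id"}, /*right_keys=*/{"id"}}};
+//   // Runs the plan and collects the joined rows into a Table.
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Table> out,
+//                         DeclarationToTable(std::move(join)));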
+ +#pragma once + +#include +#include + +#include "arrow/acero/schema_util.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" + +// This file contains hash join logic related to handling of dictionary encoded key +// columns. +// +// A key column from probe side of the join can be matched against a key column from build +// side of the join, as long as the underlying value types are equal. That means that: +// - both scalars and arrays can be used and even mixed in the same column +// - dictionary column can be matched against non-dictionary column if underlying value +// types are equal +// - dictionary column can be matched against dictionary column with a different index +// type, and potentially using a different dictionary, if underlying value types are equal +// +// We currently require in hash join that for all dictionary encoded columns, the same +// dictionary is used in all input exec batches. +// +// In order to allow matching columns with different dictionaries, different dictionary +// index types, and dictionary key against non-dictionary key, internally comparisons will +// be evaluated after remapping values on both sides of the join to a common +// representation (which will be called "unified representation"). This common +// representation is a column of int32() type (not a dictionary column). It represents an +// index in the unified dictionary computed for the (only) dictionary present on build +// side (an empty dictionary is still created for an empty build side). Null value is +// always represented in this common representation as null int32 value, unified +// dictionary will never contain a null value (so there is no ambiguity of representing +// nulls as either index to a null entry in the dictionary or null index). +// +// Unified dictionary represents values present on build side. There may be values on +// probe side that are not present in it. All such values, that are not null, are mapped +// in the common representation to a special constant kMissingValueId. +// + +namespace arrow { + +using compute::ExecBatch; +using compute::ExecContext; +using compute::internal::RowEncoder; + +namespace acero { + +/// Helper class with operations that are stateless and common to processing of dictionary +/// keys on both build and probe side. +class HashJoinDictUtil { + public: + // Null values in unified representation are always represented as null that has + // corresponding integer set to this constant + static constexpr int32_t kNullId = 0; + // Constant representing a value, that is not null, missing on the build side, in + // unified representation. + static constexpr int32_t kMissingValueId = -1; + + // Check if data types of corresponding pair of key column on build and probe side are + // compatible + static bool KeyDataTypesValid(const std::shared_ptr& probe_data_type, + const std::shared_ptr& build_data_type); + + // Input must be dictionary array or dictionary scalar. + // A precomputed and provided here lookup table in the form of int32() array will be + // used to remap input indices to unified representation. + // + static Result> IndexRemapUsingLUT( + ExecContext* ctx, const Datum& indices, int64_t batch_length, + const std::shared_ptr& map_array, + const std::shared_ptr& data_type); + + // Return int32() array that contains indices of input dictionary array or scalar after + // type casting. 
+ static Result> ConvertToInt32( + const std::shared_ptr& from_type, const Datum& input, + int64_t batch_length, ExecContext* ctx); + + // Return an array that contains elements of input int32() array after casting to a + // given integer type. This is used for mapping unified representation stored in the + // hash table on build side back to original input data type of hash join, when + // outputting hash join results to parent exec node. + // + static Result> ConvertFromInt32( + const std::shared_ptr& to_type, const Datum& input, int64_t batch_length, + ExecContext* ctx); + + // Return dictionary referenced in either dictionary array or dictionary scalar + static std::shared_ptr ExtractDictionary(const Datum& data); +}; + +/// Implements processing of dictionary arrays/scalars in key columns on the build side of +/// a hash join. +/// Each instance of this class corresponds to a single column and stores and +/// processes only the information related to that column. +/// Const methods are thread-safe, non-const methods are not (the caller must make sure +/// that only one thread at any time will access them). +/// +class HashJoinDictBuild { + public: + // Returns true if the key column (described in input by its data type) requires any + // pre- or post-processing related to handling dictionaries. + // + static bool KeyNeedsProcessing(const std::shared_ptr& build_data_type) { + return (build_data_type->id() == Type::DICTIONARY); + } + + // Data type of unified representation + static std::shared_ptr DataTypeAfterRemapping() { return int32(); } + + // Should be called only once in hash join, before processing any build or probe + // batches. + // + // Takes a pointer to the dictionary for a corresponding key column on the build side as + // an input. If the build side is empty, it still needs to be called, but with + // dictionary pointer set to null. + // + // Currently it is required that all input batches on build side share the same + // dictionary. For each input batch during its pre-processing, dictionary will be + // checked and error will be returned if it is different then the one provided in the + // call to this method. + // + // Unifies the dictionary. The order of the values is still preserved. + // Null and duplicate entries are removed. If the dictionary is already unified, its + // copy will be produced and stored within this class. + // + // Prepares the mapping from ids within original dictionary to the ids in the resulting + // dictionary. This is used later on to pre-process (map to unified representation) key + // column on build side. + // + // Prepares the reverse mapping (in the form of hash table) from values to the ids in + // the resulting dictionary. This will be used later on to pre-process (map to unified + // representation) key column on probe side. Values on probe side that are not present + // in the original dictionary will be mapped to a special constant kMissingValueId. The + // exception is made for nulls, which get always mapped to nulls (both when null is + // represented as a dictionary id pointing to a null and a null dictionary id). + // + Status Init(ExecContext* ctx, std::shared_ptr dictionary, + std::shared_ptr index_type, std::shared_ptr value_type); + + // Remap array or scalar values into unified representation (array of int32()). + // Outputs kMissingValueId if input value is not found in the unified dictionary. + // Outputs null for null input value (with corresponding data set to kNullId). 
+ // + Result> RemapInputValues(ExecContext* ctx, + const Datum& values, + int64_t batch_length) const; + + // Remap dictionary array or dictionary scalar on build side to unified representation. + // Dictionary referenced in the input must match the dictionary that was + // given during initialization. + // The output is a dictionary array that references unified dictionary. + // + Result> RemapInput( + ExecContext* ctx, const Datum& indices, int64_t batch_length, + const std::shared_ptr& data_type) const; + + // Outputs dictionary array referencing unified dictionary, given an array with 32-bit + // ids. + // Used to post-process values looked up in a hash table on build side of the hash join + // before outputting to the parent exec node. + // + Result> RemapOutput(const ArrayData& indices32Bit, + ExecContext* ctx) const; + + // Release shared pointers and memory + void CleanUp(); + + private: + // Data type of dictionary ids for the input dictionary on build side + std::shared_ptr index_type_; + // Data type of values for the input dictionary on build side + std::shared_ptr value_type_; + // Mapping from (encoded as string) values to the ids in unified dictionary + std::unordered_map hash_table_; + // Mapping from input dictionary ids to unified dictionary ids + std::shared_ptr remapped_ids_; + // Input dictionary + std::shared_ptr dictionary_; + // Unified dictionary + std::shared_ptr unified_dictionary_; +}; + +/// Implements processing of dictionary arrays/scalars in key columns on the probe side of +/// a hash join. +/// Each instance of this class corresponds to a single column and stores and +/// processes only the information related to that column. +/// It is not thread-safe - every participating thread should use its own instance of +/// this class. +/// +class HashJoinDictProbe { + public: + static bool KeyNeedsProcessing(const std::shared_ptr& probe_data_type, + const std::shared_ptr& build_data_type); + + // Data type of the result of remapping input key column. + // + // The result of remapping is what is used in hash join for matching keys on build and + // probe side. The exact data types may be different, as described below, and therefore + // a common representation is needed for simplifying comparisons of pairs of keys on + // both sides. + // + // We support matching key that is of non-dictionary type with key that is of dictionary + // type, as long as the underlying value types are equal. We support matching when both + // keys are of dictionary type, regardless whether underlying dictionary index types are + // the same or not. + // + static std::shared_ptr DataTypeAfterRemapping( + const std::shared_ptr& build_data_type); + + // Should only be called if KeyNeedsProcessing method returns true for a pair of + // corresponding key columns from build and probe side. + // Converts values in order to match the common representation for + // both build and probe side used in hash table comparison. + // Supports arrays and scalars as input. + // Argument opt_build_side should be null if dictionary key on probe side is matched + // with non-dictionary key on build side. + // + Result> RemapInput( + const HashJoinDictBuild* opt_build_side, const Datum& data, int64_t batch_length, + const std::shared_ptr& probe_data_type, + const std::shared_ptr& build_data_type, ExecContext* ctx); + + void CleanUp(); + + private: + // May be null if probe side key is non-dictionary. 
Otherwise it is used to verify that + // only a single dictionary is referenced in exec batch on probe side of hash join. + std::shared_ptr dictionary_; + // Mapping from dictionary on probe side of hash join (if it is used) to unified + // representation. + std::shared_ptr remapped_ids_; + // Encoder of key columns that uses unified representation instead of original data type + // for key columns that need to use it (have dictionaries on either side of the join). + RowEncoder encoder_; +}; + +// Encapsulates dictionary handling logic for build side of hash join. +// +class HashJoinDictBuildMulti { + public: + Status Init(const SchemaProjectionMaps& proj_map, + const ExecBatch* opt_non_empty_batch, ExecContext* ctx); + static void InitEncoder(const SchemaProjectionMaps& proj_map, + RowEncoder* encoder, ExecContext* ctx); + Status EncodeBatch(size_t thread_index, + const SchemaProjectionMaps& proj_map, + const ExecBatch& batch, RowEncoder* encoder, ExecContext* ctx) const; + Status PostDecode(const SchemaProjectionMaps& proj_map, + ExecBatch* decoded_key_batch, ExecContext* ctx); + const HashJoinDictBuild& get_dict_build(int icol) const { return remap_imp_[icol]; } + + private: + std::vector needs_remap_; + std::vector remap_imp_; +}; + +// Encapsulates dictionary handling logic for probe side of hash join +// +class HashJoinDictProbeMulti { + public: + void Init(size_t num_threads); + bool BatchRemapNeeded(size_t thread_index, + const SchemaProjectionMaps& proj_map_probe, + const SchemaProjectionMaps& proj_map_build, + ExecContext* ctx); + Status EncodeBatch(size_t thread_index, + const SchemaProjectionMaps& proj_map_probe, + const SchemaProjectionMaps& proj_map_build, + const HashJoinDictBuildMulti& dict_build, const ExecBatch& batch, + RowEncoder** out_encoder, ExecBatch* opt_out_key_batch, + ExecContext* ctx); + + private: + void InitLocalStateIfNeeded( + size_t thread_index, const SchemaProjectionMaps& proj_map_probe, + const SchemaProjectionMaps& proj_map_build, ExecContext* ctx); + static void InitEncoder(const SchemaProjectionMaps& proj_map_probe, + const SchemaProjectionMaps& proj_map_build, + RowEncoder* encoder, ExecContext* ctx); + struct ThreadLocalState { + bool is_initialized; + // Whether any key column needs remapping (because of dictionaries used) before doing + // join hash table lookups + bool any_needs_remap; + // Whether each key column needs remapping before doing join hash table lookups + std::vector needs_remap; + std::vector remap_imp; + // Encoder of key columns that uses unified representation instead of original data + // type for key columns that need to use it (have dictionaries on either side of the + // join). + RowEncoder post_remap_encoder; + }; + std::vector local_states_; +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join_node.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join_node.h new file mode 100644 index 0000000000000000000000000000000000000000..19745b8675cf0c63ed92c6e5448c9e6a68467f59 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/hash_join_node.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/acero/options.h" +#include "arrow/acero/schema_util.h" +#include "arrow/result.h" +#include "arrow/status.h" + +namespace arrow { + +using compute::ExecContext; + +namespace acero { + +class ARROW_ACERO_EXPORT HashJoinSchema { + public: + Status Init(JoinType join_type, const Schema& left_schema, + const std::vector& left_keys, const Schema& right_schema, + const std::vector& right_keys, const Expression& filter, + const std::string& left_field_name_prefix, + const std::string& right_field_name_prefix); + + Status Init(JoinType join_type, const Schema& left_schema, + const std::vector& left_keys, + const std::vector& left_output, const Schema& right_schema, + const std::vector& right_keys, + const std::vector& right_output, const Expression& filter, + const std::string& left_field_name_prefix, + const std::string& right_field_name_prefix); + + static Status ValidateSchemas(JoinType join_type, const Schema& left_schema, + const std::vector& left_keys, + const std::vector& left_output, + const Schema& right_schema, + const std::vector& right_keys, + const std::vector& right_output, + const std::string& left_field_name_prefix, + const std::string& right_field_name_prefix); + + bool HasDictionaries() const; + + bool HasLargeBinary() const; + + Result BindFilter(Expression filter, const Schema& left_schema, + const Schema& right_schema, ExecContext* exec_context); + std::shared_ptr MakeOutputSchema(const std::string& left_field_name_suffix, + const std::string& right_field_name_suffix); + + bool LeftPayloadIsEmpty() const { return PayloadIsEmpty(0); } + + bool RightPayloadIsEmpty() const { return PayloadIsEmpty(1); } + + static int kMissingField() { + return SchemaProjectionMaps::kMissingField; + } + + SchemaProjectionMaps proj_maps[2]; + + private: + static bool IsTypeSupported(const DataType& type); + + Status CollectFilterColumns(std::vector& left_filter, + std::vector& right_filter, + const Expression& filter, const Schema& left_schema, + const Schema& right_schema); + + Expression RewriteFilterToUseFilterSchema(int right_filter_offset, + const SchemaProjectionMap& left_to_filter, + const SchemaProjectionMap& right_to_filter, + const Expression& filter); + + bool PayloadIsEmpty(int side) const { + assert(side == 0 || side == 1); + return proj_maps[side].num_cols(HashJoinProjection::PAYLOAD) == 0; + } + + static Result> ComputePayload(const Schema& schema, + const std::vector& output, + const std::vector& filter, + const std::vector& key); +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/map_node.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/map_node.h new file mode 100644 index 0000000000000000000000000000000000000000..8bdd0ab2ca3854c6561aa3735ae143e7c58b4f77 --- /dev/null +++ 
b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/map_node.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/acero/exec_plan.h" +#include "arrow/acero/util.h" +#include "arrow/acero/visibility.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/cancel.h" +#include "arrow/util/type_fwd.h" + +namespace arrow { +namespace acero { + +/// A utility base class for simple exec nodes with one input +/// +/// Pause/Resume Producing are forwarded appropriately +/// There is nothing to do in StopProducingImpl +/// +/// An AtomicCounter is used to keep track of when all data has arrived. When it +/// has the Finish() method will be invoked +class ARROW_ACERO_EXPORT MapNode : public ExecNode, public TracedNode { + public: + MapNode(ExecPlan* plan, std::vector inputs, + std::shared_ptr output_schema); + + Status InputFinished(ExecNode* input, int total_batches) override; + + Status StartProducing() override; + + void PauseProducing(ExecNode* output, int32_t counter) override; + + void ResumeProducing(ExecNode* output, int32_t counter) override; + + Status InputReceived(ExecNode* input, ExecBatch batch) override; + + const Ordering& ordering() const override; + + protected: + Status StopProducingImpl() override; + + /// Transform a batch + /// + /// The output batch will have the same guarantee as the input batch + /// If this was the last batch this call may trigger Finish() + virtual Result ProcessBatch(ExecBatch batch) = 0; + + /// Function called after all data has been received + /// + /// By default this does nothing. Override this to provide a custom implementation. + virtual void Finish(); + + protected: + // Counter for the number of batches received + AtomicCounter input_counter_; +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/options.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/options.h new file mode 100644 index 0000000000000000000000000000000000000000..4447e9c67a19930111c3a3df33ae874fd330700c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/options.h @@ -0,0 +1,866 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/acero/type_fwd.h" +#include "arrow/acero/visibility.h" +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/expression.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/async_util.h" + +namespace arrow { + +using compute::Aggregate; +using compute::ExecBatch; +using compute::Expression; +using compute::literal; +using compute::Ordering; +using compute::SelectKOptions; +using compute::SortOptions; + +namespace internal { + +class Executor; + +} // namespace internal + +namespace acero { + +/// \brief This must not be used in release-mode +struct DebugOptions; + +using AsyncExecBatchGenerator = AsyncGenerator>; + +/// \addtogroup acero-nodes +/// @{ + +/// \brief A base class for all options objects +/// +/// The only time this is used directly is when a node has no configuration +class ARROW_ACERO_EXPORT ExecNodeOptions { + public: + virtual ~ExecNodeOptions() = default; + + /// \brief This must not be used in release-mode + std::shared_ptr debug_opts; +}; + +/// \brief A node representing a generic source of data for Acero +/// +/// The source node will start calling `generator` during StartProducing. An initial +/// task will be created that will call `generator`. It will not call `generator` +/// reentrantly. If the source can be read in parallel then those details should be +/// encapsulated within `generator`. +/// +/// For each batch received a new task will be created to push that batch downstream. +/// This task will slice smaller units of size `ExecPlan::kMaxBatchSize` from the +/// parent batch and call InputReceived. Thus, if the `generator` yields a large +/// batch it may result in several calls to InputReceived. +/// +/// The SourceNode will, by default, assign an implicit ordering to outgoing batches. +/// This is valid as long as the generator generates batches in a deterministic fashion. +/// Currently, the only way to override this is to subclass the SourceNode. +/// +/// This node is not generally used directly but can serve as the basis for various +/// specialized nodes. +class ARROW_ACERO_EXPORT SourceNodeOptions : public ExecNodeOptions { + public: + /// Create an instance from values + SourceNodeOptions(std::shared_ptr output_schema, + std::function>()> generator) + : output_schema(std::move(output_schema)), generator(std::move(generator)) {} + + /// \brief the schema for batches that will be generated by this source + std::shared_ptr output_schema; + /// \brief an asynchronous stream of batches ending with std::nullopt + std::function>()> generator; +}; + +/// \brief a node that generates data from a table already loaded in memory +/// +/// The table source node will slice off chunks, defined by `max_batch_size` +/// for parallel processing. The table source node extends source node and so these +/// chunks will be iteratively processed in small batches. \see SourceNodeOptions +/// for details. 
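For context, a hedged sketch of how the SourceNodeOptions declared above is typically used (Declaration and DeclarationToTable are assumed to come from the companion arrow/acero/exec_plan.h header, and MakeVectorGenerator from arrow/util/async_generator.h): a vector of pre-built ExecBatches is exposed to the plan through the generator callback.

#include <memory>
#include <optional>
#include <vector>

#include "arrow/acero/exec_plan.h"  // Declaration, DeclarationToTable (assumed available)
#include "arrow/acero/options.h"
#include "arrow/util/async_generator.h"

namespace acero = arrow::acero;
namespace cp = arrow::compute;

arrow::Result<std::shared_ptr<arrow::Table>> RunSource(std::shared_ptr<arrow::Schema> schema,
                                                       std::vector<cp::ExecBatch> batches) {
  // Wrap the batches in an async generator; once exhausted it should yield std::nullopt,
  // which the source node treats as end-of-stream.
  std::vector<std::optional<cp::ExecBatch>> opt_batches(batches.begin(), batches.end());
  auto gen = arrow::MakeVectorGenerator(std::move(opt_batches));

  acero::Declaration source{"source",
                            acero::SourceNodeOptions{std::move(schema), std::move(gen)}};
  return acero::DeclarationToTable(std::move(source));
}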
+class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions {
+ public:
+  static constexpr int64_t kDefaultMaxBatchSize = 1 << 20;
+
+  /// Create an instance from values
+  TableSourceNodeOptions(std::shared_ptr<Table> table,
+                         int64_t max_batch_size = kDefaultMaxBatchSize)
+      : table(std::move(table)), max_batch_size(max_batch_size) {}
+
+  /// \brief a table which acts as the data source
+  std::shared_ptr<Table>
table; + /// \brief size of batches to emit from this node + /// If the table is larger the node will emit multiple batches from the + /// the table to be processed in parallel. + int64_t max_batch_size; +}; + +/// \brief define a lazily resolved Arrow table. +/// +/// The table uniquely identified by the names can typically be resolved at the time when +/// the plan is to be consumed. +/// +/// This node is for serialization purposes only and can never be executed. +class ARROW_ACERO_EXPORT NamedTableNodeOptions : public ExecNodeOptions { + public: + /// Create an instance from values + NamedTableNodeOptions(std::vector names, std::shared_ptr schema) + : names(std::move(names)), schema(std::move(schema)) {} + + /// \brief the names to put in the serialized plan + std::vector names; + /// \brief the output schema of the table + std::shared_ptr schema; +}; + +/// \brief a source node which feeds data from a synchronous iterator of batches +/// +/// ItMaker is a maker of an iterator of tabular data. +/// +/// The node can be configured to use an I/O executor. If set then each time the +/// iterator is polled a new I/O thread task will be created to do the polling. This +/// allows a blocking iterator to stay off the CPU thread pool. +template +class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions { + public: + /// Create an instance that will create a new task on io_executor for each iteration + SchemaSourceNodeOptions(std::shared_ptr schema, ItMaker it_maker, + arrow::internal::Executor* io_executor) + : schema(std::move(schema)), + it_maker(std::move(it_maker)), + io_executor(io_executor), + requires_io(true) {} + + /// Create an instance that will either iterate synchronously or use the default I/O + /// executor + SchemaSourceNodeOptions(std::shared_ptr schema, ItMaker it_maker, + bool requires_io = false) + : schema(std::move(schema)), + it_maker(std::move(it_maker)), + io_executor(NULLPTR), + requires_io(requires_io) {} + + /// \brief The schema of the record batches from the iterator + std::shared_ptr schema; + + /// \brief A maker of an iterator which acts as the data source + ItMaker it_maker; + + /// \brief The executor to use for scanning the iterator + /// + /// Defaults to the default I/O executor. Only used if requires_io is true. + /// If requires_io is false then this MUST be nullptr. + arrow::internal::Executor* io_executor; + + /// \brief If true then items will be fetched from the iterator on a dedicated I/O + /// thread to keep I/O off the CPU thread + bool requires_io; +}; + +/// a source node that reads from a RecordBatchReader +/// +/// Each iteration of the RecordBatchReader will be run on a new thread task created +/// on the I/O thread pool. +class ARROW_ACERO_EXPORT RecordBatchReaderSourceNodeOptions : public ExecNodeOptions { + public: + /// Create an instance from values + RecordBatchReaderSourceNodeOptions(std::shared_ptr reader, + arrow::internal::Executor* io_executor = NULLPTR) + : reader(std::move(reader)), io_executor(io_executor) {} + + /// \brief The RecordBatchReader which acts as the data source + std::shared_ptr reader; + + /// \brief The executor to use for the reader + /// + /// Defaults to the default I/O executor. 
+ arrow::internal::Executor* io_executor; +}; + +/// a source node that reads from an iterator of array vectors +using ArrayVectorIteratorMaker = std::function>()>; +/// \brief An extended Source node which accepts a schema and array-vectors +class ARROW_ACERO_EXPORT ArrayVectorSourceNodeOptions + : public SchemaSourceNodeOptions { + using SchemaSourceNodeOptions::SchemaSourceNodeOptions; +}; + +/// a source node that reads from an iterator of ExecBatch +using ExecBatchIteratorMaker = std::function>()>; +/// \brief An extended Source node which accepts a schema and exec-batches +class ARROW_ACERO_EXPORT ExecBatchSourceNodeOptions + : public SchemaSourceNodeOptions { + public: + using SchemaSourceNodeOptions::SchemaSourceNodeOptions; + ExecBatchSourceNodeOptions(std::shared_ptr schema, + std::vector batches, + ::arrow::internal::Executor* io_executor); + ExecBatchSourceNodeOptions(std::shared_ptr schema, + std::vector batches, bool requires_io = false); +}; + +using RecordBatchIteratorMaker = std::function>()>; +/// a source node that reads from an iterator of RecordBatch +class ARROW_ACERO_EXPORT RecordBatchSourceNodeOptions + : public SchemaSourceNodeOptions { + using SchemaSourceNodeOptions::SchemaSourceNodeOptions; +}; + +/// \brief a node which excludes some rows from batches passed through it +/// +/// filter_expression will be evaluated against each batch which is pushed to +/// this node. Any rows for which filter_expression does not evaluate to `true` will be +/// excluded in the batch emitted by this node. +/// +/// This node will emit empty batches if all rows are excluded. This is done +/// to avoid gaps in the ordering. +class ARROW_ACERO_EXPORT FilterNodeOptions : public ExecNodeOptions { + public: + /// \brief create an instance from values + explicit FilterNodeOptions(Expression filter_expression) + : filter_expression(std::move(filter_expression)) {} + + /// \brief the expression to filter batches + /// + /// The return type of this expression must be boolean + Expression filter_expression; +}; + +/// \brief a node which selects a specified subset from the input +class ARROW_ACERO_EXPORT FetchNodeOptions : public ExecNodeOptions { + public: + static constexpr std::string_view kName = "fetch"; + /// \brief create an instance from values + FetchNodeOptions(int64_t offset, int64_t count) : offset(offset), count(count) {} + /// \brief the number of rows to skip + int64_t offset; + /// \brief the number of rows to keep (not counting skipped rows) + int64_t count; +}; + +/// \brief a node which executes expressions on input batches, producing batches +/// of the same length with new columns. +/// +/// Each expression will be evaluated against each batch which is pushed to +/// this node to produce a corresponding output column. +/// +/// If names are not provided, the string representations of exprs will be used. +class ARROW_ACERO_EXPORT ProjectNodeOptions : public ExecNodeOptions { + public: + /// \brief create an instance from values + explicit ProjectNodeOptions(std::vector expressions, + std::vector names = {}) + : expressions(std::move(expressions)), names(std::move(names)) {} + + /// \brief the expressions to run on the batches + /// + /// The output will have one column for each expression. If you wish to keep any of + /// the columns from the input then you should create a simple field_ref expression + /// for that column. 
+ std::vector expressions; + /// \brief the names of the output columns + /// + /// If this is not specified then the result of calling ToString on the expression will + /// be used instead + /// + /// This list should either be empty or have the same length as `expressions` + std::vector names; +}; + +/// \brief a node which aggregates input batches and calculates summary statistics +/// +/// The node can summarize the entire input or it can group the input with grouping keys +/// and segment keys. +/// +/// By default, the aggregate node is a pipeline breaker. It must accumulate all input +/// before any output is produced. Segment keys are a performance optimization. If +/// you know your input is already partitioned by one or more columns then you can +/// specify these as segment keys. At each change in the segment keys the node will +/// emit values for all data seen so far. +/// +/// Segment keys are currently limited to single-threaded mode. +/// +/// Both keys and segment-keys determine the group. However segment-keys are also used +/// for determining grouping segments, which should be large, and allow streaming a +/// partial aggregation result after processing each segment. One common use-case for +/// segment-keys is ordered aggregation, in which the segment-key attribute specifies a +/// column with non-decreasing values or a lexicographically-ordered set of such columns. +/// +/// If the keys attribute is a non-empty vector, then each aggregate in `aggregates` is +/// expected to be a HashAggregate function. If the keys attribute is an empty vector, +/// then each aggregate is assumed to be a ScalarAggregate function. +/// +/// If the segment_keys attribute is a non-empty vector, then segmented aggregation, as +/// described above, applies. +/// +/// The keys and segment_keys vectors must be disjoint. +/// +/// If no measures are provided then you will simply get the list of unique keys. +/// +/// This node outputs segment keys first, followed by regular keys, followed by one +/// column for each aggregate. 
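As a sketch of how the filter and project options above are combined in practice (assuming Declaration, Declaration::Sequence and DeclarationToTable from the companion arrow/acero/exec_plan.h, plus the expression helpers declared in arrow/compute/expression.h): keep rows where x > 0, then output x together with a derived column.

#include <memory>

#include "arrow/acero/exec_plan.h"  // Declaration, DeclarationToTable (assumed available)
#include "arrow/acero/options.h"
#include "arrow/compute/expression.h"

namespace acero = arrow::acero;
namespace cp = arrow::compute;

arrow::Result<std::shared_ptr<arrow::Table>> FilterAndProject(
    std::shared_ptr<arrow::Table> table) {
  acero::Declaration plan = acero::Declaration::Sequence({
      {"table_source", acero::TableSourceNodeOptions{std::move(table)}},
      // rows failing the predicate are dropped (possibly leaving empty batches)
      {"filter", acero::FilterNodeOptions{cp::greater(cp::field_ref("x"), cp::literal(0))}},
      // one output column per expression; names are optional
      {"project", acero::ProjectNodeOptions{
                      {cp::field_ref("x"),
                       cp::call("multiply", {cp::field_ref("x"), cp::literal(2)})},
                      {"x", "x_times_2"}}},
  });
  return acero::DeclarationToTable(std::move(plan));
}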
+class ARROW_ACERO_EXPORT AggregateNodeOptions : public ExecNodeOptions { + public: + /// \brief create an instance from values + explicit AggregateNodeOptions(std::vector aggregates, + std::vector keys = {}, + std::vector segment_keys = {}) + : aggregates(std::move(aggregates)), + keys(std::move(keys)), + segment_keys(std::move(segment_keys)) {} + + // aggregations which will be applied to the targeted fields + std::vector aggregates; + // keys by which aggregations will be grouped (optional) + std::vector keys; + // keys by which aggregations will be segmented (optional) + std::vector segment_keys; +}; + +/// \brief a default value at which backpressure will be applied +constexpr int32_t kDefaultBackpressureHighBytes = 1 << 30; // 1GiB +/// \brief a default value at which backpressure will be removed +constexpr int32_t kDefaultBackpressureLowBytes = 1 << 28; // 256MiB + +/// \brief an interface that can be queried for backpressure statistics +class ARROW_ACERO_EXPORT BackpressureMonitor { + public: + virtual ~BackpressureMonitor() = default; + /// \brief fetches the number of bytes currently queued up + virtual uint64_t bytes_in_use() = 0; + /// \brief checks to see if backpressure is currently applied + virtual bool is_paused() = 0; +}; + +/// \brief Options to control backpressure behavior +struct ARROW_ACERO_EXPORT BackpressureOptions { + /// \brief Create default options that perform no backpressure + BackpressureOptions() : resume_if_below(0), pause_if_above(0) {} + /// \brief Create options that will perform backpressure + /// + /// \param resume_if_below The producer should resume producing if the backpressure + /// queue has fewer than resume_if_below items. + /// \param pause_if_above The producer should pause producing if the backpressure + /// queue has more than pause_if_above items + BackpressureOptions(uint64_t resume_if_below, uint64_t pause_if_above) + : resume_if_below(resume_if_below), pause_if_above(pause_if_above) {} + + /// \brief create an instance using default values for backpressure limits + static BackpressureOptions DefaultBackpressure() { + return BackpressureOptions(kDefaultBackpressureLowBytes, + kDefaultBackpressureHighBytes); + } + + /// \brief helper method to determine if backpressure is disabled + /// \return true if pause_if_above is greater than zero, false otherwise + bool should_apply_backpressure() const { return pause_if_above > 0; } + + /// \brief the number of bytes at which the producer should resume producing + uint64_t resume_if_below; + /// \brief the number of bytes at which the producer should pause producing + /// + /// If this is <= 0 then backpressure will be disabled + uint64_t pause_if_above; +}; + +/// \brief a sink node which collects results in a queue +/// +/// Emitted batches will only be ordered if there is a meaningful ordering +/// and sequence_output is not set to false. 
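A sketch of a grouped aggregation built from the options above, summing "value" for every distinct "key" (the Aggregate constructor taking function, options, target and name is assumed from arrow/compute/api_aggregate.h, and the Declaration helpers from arrow/acero/exec_plan.h):

#include <memory>
#include <vector>

#include "arrow/acero/exec_plan.h"  // Declaration, DeclarationToTable (assumed available)
#include "arrow/acero/options.h"
#include "arrow/type.h"

namespace acero = arrow::acero;
namespace cp = arrow::compute;

arrow::Result<std::shared_ptr<arrow::Table>> SumByKey(std::shared_ptr<arrow::Table> table) {
  // With a non-empty "keys" vector, each aggregate must be a hash aggregate function.
  std::vector<cp::Aggregate> aggregates;
  aggregates.emplace_back("hash_sum", /*options=*/nullptr, arrow::FieldRef("value"),
                          /*name=*/"value_sum");
  acero::AggregateNodeOptions agg_opts{std::move(aggregates),
                                       /*keys=*/{arrow::FieldRef("key")}};

  acero::Declaration plan = acero::Declaration::Sequence({
      {"table_source", acero::TableSourceNodeOptions{std::move(table)}},
      {"aggregate", std::move(agg_opts)},
  });
  return acero::DeclarationToTable(std::move(plan));
}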
+class ARROW_ACERO_EXPORT SinkNodeOptions : public ExecNodeOptions { + public: + explicit SinkNodeOptions(std::function>()>* generator, + std::shared_ptr* schema, + BackpressureOptions backpressure = {}, + BackpressureMonitor** backpressure_monitor = NULLPTR, + std::optional sequence_output = std::nullopt) + : generator(generator), + schema(schema), + backpressure(backpressure), + backpressure_monitor(backpressure_monitor), + sequence_output(sequence_output) {} + + explicit SinkNodeOptions(std::function>()>* generator, + BackpressureOptions backpressure = {}, + BackpressureMonitor** backpressure_monitor = NULLPTR, + std::optional sequence_output = std::nullopt) + : generator(generator), + schema(NULLPTR), + backpressure(std::move(backpressure)), + backpressure_monitor(backpressure_monitor), + sequence_output(sequence_output) {} + + /// \brief A pointer to a generator of batches. + /// + /// This will be set when the node is added to the plan and should be used to consume + /// data from the plan. If this function is not called frequently enough then the sink + /// node will start to accumulate data and may apply backpressure. + std::function>()>* generator; + /// \brief A pointer which will be set to the schema of the generated batches + /// + /// This is optional, if nullptr is passed in then it will be ignored. + /// This will be set when the node is added to the plan, before StartProducing is called + std::shared_ptr* schema; + /// \brief Options to control when to apply backpressure + /// + /// This is optional, the default is to never apply backpressure. If the plan is not + /// consumed quickly enough the system may eventually run out of memory. + BackpressureOptions backpressure; + /// \brief A pointer to a backpressure monitor + /// + /// This will be set when the node is added to the plan. This can be used to inspect + /// the amount of data currently queued in the sink node. This is an optional utility + /// and backpressure can be applied even if this is not used. + BackpressureMonitor** backpressure_monitor; + /// \brief Controls whether batches should be emitted immediately or sequenced in order + /// + /// \see QueryOptions for more details + std::optional sequence_output; +}; + +/// \brief Control used by a SinkNodeConsumer to pause & resume +/// +/// Callers should ensure that they do not call Pause and Resume simultaneously and they +/// should sequence things so that a call to Pause() is always followed by an eventual +/// call to Resume() +class ARROW_ACERO_EXPORT BackpressureControl { + public: + virtual ~BackpressureControl() = default; + /// \brief Ask the input to pause + /// + /// This is best effort, batches may continue to arrive + /// Must eventually be followed by a call to Resume() or deadlock will occur + virtual void Pause() = 0; + /// \brief Ask the input to resume + virtual void Resume() = 0; +}; + +/// \brief a sink node that consumes the data as part of the plan using callbacks +class ARROW_ACERO_EXPORT SinkNodeConsumer { + public: + virtual ~SinkNodeConsumer() = default; + /// \brief Prepare any consumer state + /// + /// This will be run once the schema is finalized as the plan is starting and + /// before any calls to Consume. A common use is to save off the schema so that + /// batches can be interpreted. 
+ virtual Status Init(const std::shared_ptr& schema, + BackpressureControl* backpressure_control, ExecPlan* plan) = 0; + /// \brief Consume a batch of data + virtual Status Consume(ExecBatch batch) = 0; + /// \brief Signal to the consumer that the last batch has been delivered + /// + /// The returned future should only finish when all outstanding tasks have completed + /// + /// If the plan is ended early or aborts due to an error then this will not be + /// called. + virtual Future<> Finish() = 0; +}; + +/// \brief Add a sink node which consumes data within the exec plan run +class ARROW_ACERO_EXPORT ConsumingSinkNodeOptions : public ExecNodeOptions { + public: + explicit ConsumingSinkNodeOptions(std::shared_ptr consumer, + std::vector names = {}, + std::optional sequence_output = std::nullopt) + : consumer(std::move(consumer)), + names(std::move(names)), + sequence_output(sequence_output) {} + + std::shared_ptr consumer; + /// \brief Names to rename the sink's schema fields to + /// + /// If specified then names must be provided for all fields. Currently, only a flat + /// schema is supported (see GH-31875). + /// + /// If not specified then names will be generated based on the source data. + std::vector names; + /// \brief Controls whether batches should be emitted immediately or sequenced in order + /// + /// \see QueryOptions for more details + std::optional sequence_output; +}; + +/// \brief Make a node which sorts rows passed through it +/// +/// All batches pushed to this node will be accumulated, then sorted, by the given +/// fields. Then sorted batches will be forwarded to the generator in sorted order. +class ARROW_ACERO_EXPORT OrderBySinkNodeOptions : public SinkNodeOptions { + public: + /// \brief create an instance from values + explicit OrderBySinkNodeOptions( + SortOptions sort_options, + std::function>()>* generator) + : SinkNodeOptions(generator), sort_options(std::move(sort_options)) {} + + /// \brief options describing which columns and direction to sort + SortOptions sort_options; +}; + +/// \brief Apply a new ordering to data +/// +/// Currently this node works by accumulating all data, sorting, and then emitting +/// the new data with an updated batch index. +/// +/// Larger-than-memory sort is not currently supported. 
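A sketch of the consumer interface above in use (RowCountingConsumer is an illustrative name; DeclarationToStatus is assumed from the companion arrow/acero/exec_plan.h): the plan pushes batches into Consume and waits on the future returned by Finish before shutting down.

#include <atomic>
#include <memory>

#include "arrow/acero/exec_plan.h"  // Declaration, DeclarationToStatus (assumed available)
#include "arrow/acero/options.h"

namespace acero = arrow::acero;
namespace cp = arrow::compute;

// Counts rows as they stream through a consuming sink.
class RowCountingConsumer : public acero::SinkNodeConsumer {
 public:
  arrow::Status Init(const std::shared_ptr<arrow::Schema>& schema,
                     acero::BackpressureControl* backpressure_control,
                     acero::ExecPlan* plan) override {
    schema_ = schema;  // keep the schema so batches could be interpreted later
    return arrow::Status::OK();
  }
  arrow::Status Consume(cp::ExecBatch batch) override {
    num_rows_ += batch.length;
    return arrow::Status::OK();
  }
  arrow::Future<> Finish() override { return arrow::Future<>::MakeFinished(); }

  int64_t num_rows() const { return num_rows_.load(); }

 private:
  std::shared_ptr<arrow::Schema> schema_;
  std::atomic<int64_t> num_rows_{0};
};

arrow::Status CountRows(std::shared_ptr<arrow::Table> table, int64_t* out_rows) {
  auto consumer = std::make_shared<RowCountingConsumer>();
  acero::Declaration plan = acero::Declaration::Sequence({
      {"table_source", acero::TableSourceNodeOptions{std::move(table)}},
      {"consuming_sink", acero::ConsumingSinkNodeOptions{consumer}},
  });
  ARROW_RETURN_NOT_OK(acero::DeclarationToStatus(std::move(plan)));
  *out_rows = consumer->num_rows();
  return arrow::Status::OK();
}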
+class ARROW_ACERO_EXPORT OrderByNodeOptions : public ExecNodeOptions { + public: + static constexpr std::string_view kName = "order_by"; + explicit OrderByNodeOptions(Ordering ordering) : ordering(std::move(ordering)) {} + + /// \brief The new ordering to apply to outgoing data + Ordering ordering; +}; + +enum class JoinType { + LEFT_SEMI, + RIGHT_SEMI, + LEFT_ANTI, + RIGHT_ANTI, + INNER, + LEFT_OUTER, + RIGHT_OUTER, + FULL_OUTER +}; + +std::string ToString(JoinType t); + +enum class JoinKeyCmp { EQ, IS }; + +/// \brief a node which implements a join operation using a hash table +class ARROW_ACERO_EXPORT HashJoinNodeOptions : public ExecNodeOptions { + public: + static constexpr const char* default_output_suffix_for_left = ""; + static constexpr const char* default_output_suffix_for_right = ""; + /// \brief create an instance from values that outputs all columns + HashJoinNodeOptions( + JoinType in_join_type, std::vector in_left_keys, + std::vector in_right_keys, Expression filter = literal(true), + std::string output_suffix_for_left = default_output_suffix_for_left, + std::string output_suffix_for_right = default_output_suffix_for_right, + bool disable_bloom_filter = false) + : join_type(in_join_type), + left_keys(std::move(in_left_keys)), + right_keys(std::move(in_right_keys)), + output_all(true), + output_suffix_for_left(std::move(output_suffix_for_left)), + output_suffix_for_right(std::move(output_suffix_for_right)), + filter(std::move(filter)), + disable_bloom_filter(disable_bloom_filter) { + this->key_cmp.resize(this->left_keys.size()); + for (size_t i = 0; i < this->left_keys.size(); ++i) { + this->key_cmp[i] = JoinKeyCmp::EQ; + } + } + /// \brief create an instance from keys + /// + /// This will create an inner join that outputs all columns and has no post join filter + /// + /// `in_left_keys` should have the same length and types as `in_right_keys` + /// @param in_left_keys the keys in the left input + /// @param in_right_keys the keys in the right input + HashJoinNodeOptions(std::vector in_left_keys, + std::vector in_right_keys) + : left_keys(std::move(in_left_keys)), right_keys(std::move(in_right_keys)) { + this->join_type = JoinType::INNER; + this->output_all = true; + this->output_suffix_for_left = default_output_suffix_for_left; + this->output_suffix_for_right = default_output_suffix_for_right; + this->key_cmp.resize(this->left_keys.size()); + for (size_t i = 0; i < this->left_keys.size(); ++i) { + this->key_cmp[i] = JoinKeyCmp::EQ; + } + this->filter = literal(true); + } + /// \brief create an instance from values using JoinKeyCmp::EQ for all comparisons + HashJoinNodeOptions( + JoinType join_type, std::vector left_keys, + std::vector right_keys, std::vector left_output, + std::vector right_output, Expression filter = literal(true), + std::string output_suffix_for_left = default_output_suffix_for_left, + std::string output_suffix_for_right = default_output_suffix_for_right, + bool disable_bloom_filter = false) + : join_type(join_type), + left_keys(std::move(left_keys)), + right_keys(std::move(right_keys)), + output_all(false), + left_output(std::move(left_output)), + right_output(std::move(right_output)), + output_suffix_for_left(std::move(output_suffix_for_left)), + output_suffix_for_right(std::move(output_suffix_for_right)), + filter(std::move(filter)), + disable_bloom_filter(disable_bloom_filter) { + this->key_cmp.resize(this->left_keys.size()); + for (size_t i = 0; i < this->left_keys.size(); ++i) { + this->key_cmp[i] = JoinKeyCmp::EQ; + } + } + /// \brief create an 
instance from values + HashJoinNodeOptions( + JoinType join_type, std::vector left_keys, + std::vector right_keys, std::vector left_output, + std::vector right_output, std::vector key_cmp, + Expression filter = literal(true), + std::string output_suffix_for_left = default_output_suffix_for_left, + std::string output_suffix_for_right = default_output_suffix_for_right, + bool disable_bloom_filter = false) + : join_type(join_type), + left_keys(std::move(left_keys)), + right_keys(std::move(right_keys)), + output_all(false), + left_output(std::move(left_output)), + right_output(std::move(right_output)), + key_cmp(std::move(key_cmp)), + output_suffix_for_left(std::move(output_suffix_for_left)), + output_suffix_for_right(std::move(output_suffix_for_right)), + filter(std::move(filter)), + disable_bloom_filter(disable_bloom_filter) {} + + HashJoinNodeOptions() = default; + + // type of join (inner, left, semi...) + JoinType join_type = JoinType::INNER; + // key fields from left input + std::vector left_keys; + // key fields from right input + std::vector right_keys; + // if set all valid fields from both left and right input will be output + // (and field ref vectors for output fields will be ignored) + bool output_all = false; + // output fields passed from left input + std::vector left_output; + // output fields passed from right input + std::vector right_output; + // key comparison function (determines whether a null key is equal another null + // key or not) + std::vector key_cmp; + // suffix added to names of output fields coming from left input (used to distinguish, + // if necessary, between fields of the same name in left and right input and can be left + // empty if there are no name collisions) + std::string output_suffix_for_left; + // suffix added to names of output fields coming from right input + std::string output_suffix_for_right; + // residual filter which is applied to matching rows. Rows that do not match + // the filter are not included. The filter is applied against the + // concatenated input schema (left fields then right fields) and can reference + // fields that are not included in the output. + Expression filter = literal(true); + // whether or not to disable Bloom filters in this join + bool disable_bloom_filter = false; +}; + +/// \brief a node which implements the asof join operation +/// +/// Note, this API is experimental and will change in the future +/// +/// This node takes one left table and any number of right tables, and asof joins them +/// together. Batches produced by each input must be ordered by the "on" key. +/// This node will output one row for each row in the left table. +class ARROW_ACERO_EXPORT AsofJoinNodeOptions : public ExecNodeOptions { + public: + /// \brief Keys for one input table of the AsofJoin operation + /// + /// The keys must be consistent across the input tables: + /// Each "on" key must refer to a field of the same type and units across the tables. + /// Each "by" key must refer to a list of fields of the same types across the tables. + struct Keys { + /// \brief "on" key for the join. + /// + /// The input table must be sorted by the "on" key. Must be a single field of a common + /// type. Inexact match is used on the "on" key. i.e., a row is considered a match iff + /// left_on - tolerance <= right_on <= left_on. + /// Currently, the "on" key must be of an integer, date, or timestamp type. + FieldRef on_key; + /// \brief "by" key for the join. + /// + /// Each input table must have each field of the "by" key. 
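A sketch of the join options above wired into a plan (Declaration and DeclarationToTable assumed from arrow/acero/exec_plan.h): an inner equi-join of two tables on their "id" columns via the keys-only constructor, which defaults to JoinType::INNER, outputs all columns, and applies no residual filter.

#include <memory>

#include "arrow/acero/exec_plan.h"  // Declaration, DeclarationToTable (assumed available)
#include "arrow/acero/options.h"
#include "arrow/type.h"

namespace acero = arrow::acero;

arrow::Result<std::shared_ptr<arrow::Table>> InnerJoinOnId(
    std::shared_ptr<arrow::Table> left, std::shared_ptr<arrow::Table> right) {
  acero::Declaration left_src{"table_source",
                              acero::TableSourceNodeOptions{std::move(left)}};
  acero::Declaration right_src{"table_source",
                               acero::TableSourceNodeOptions{std::move(right)}};

  // Keys-only constructor: INNER join, all columns output, no residual filter.
  acero::HashJoinNodeOptions join_opts{/*in_left_keys=*/{"id"}, /*in_right_keys=*/{"id"}};

  acero::Declaration join{"hashjoin",
                          {std::move(left_src), std::move(right_src)},
                          std::move(join_opts)};
  return acero::DeclarationToTable(std::move(join));
}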
Exact equality is used for + /// each field of the "by" key. + /// Currently, each field of the "by" key must be of an integer, date, timestamp, or + /// base-binary type. + std::vector by_key; + }; + + AsofJoinNodeOptions(std::vector input_keys, int64_t tolerance) + : input_keys(std::move(input_keys)), tolerance(tolerance) {} + + /// \brief AsofJoin keys per input table. At least two keys must be given. The first key + /// corresponds to a left table and all other keys correspond to right tables for the + /// as-of-join. + /// + /// \see `Keys` for details. + std::vector input_keys; + /// \brief Tolerance for inexact "on" key matching. A right row is considered a match + /// with the left row if `right.on - left.on <= tolerance`. The `tolerance` may be: + /// - negative, in which case a past-as-of-join occurs; + /// - or positive, in which case a future-as-of-join occurs; + /// - or zero, in which case an exact-as-of-join occurs. + /// + /// The tolerance is interpreted in the same units as the "on" key. + int64_t tolerance; +}; + +/// \brief a node which select top_k/bottom_k rows passed through it +/// +/// All batches pushed to this node will be accumulated, then selected, by the given +/// fields. Then sorted batches will be forwarded to the generator in sorted order. +class ARROW_ACERO_EXPORT SelectKSinkNodeOptions : public SinkNodeOptions { + public: + explicit SelectKSinkNodeOptions( + SelectKOptions select_k_options, + std::function>()>* generator) + : SinkNodeOptions(generator), select_k_options(std::move(select_k_options)) {} + + /// SelectK options + SelectKOptions select_k_options; +}; + +/// \brief a sink node which accumulates all output into a table +class ARROW_ACERO_EXPORT TableSinkNodeOptions : public ExecNodeOptions { + public: + /// \brief create an instance from values + explicit TableSinkNodeOptions(std::shared_ptr
* output_table, + std::optional sequence_output = std::nullopt) + : output_table(output_table), sequence_output(sequence_output) {} + + /// \brief an "out parameter" specifying the table that will be created + /// + /// Must not be null and remain valid for the entirety of the plan execution. After the + /// plan has completed this will be set to point to the result table + std::shared_ptr
* output_table; + /// \brief Controls whether batches should be emitted immediately or sequenced in order + /// + /// \see QueryOptions for more details + std::optional sequence_output; + /// \brief Custom names to use for the columns. + /// + /// If specified then names must be provided for all fields. Currently, only a flat + /// schema is supported (see GH-31875). + /// + /// If not specified then names will be generated based on the source data. + std::vector names; +}; + +/// \brief a row template that describes one row that will be generated for each input row +struct ARROW_ACERO_EXPORT PivotLongerRowTemplate { + PivotLongerRowTemplate(std::vector feature_values, + std::vector> measurement_values) + : feature_values(std::move(feature_values)), + measurement_values(std::move(measurement_values)) {} + /// A (typically unique) set of feature values for the template, usually derived from a + /// column name + /// + /// These will be used to populate the feature columns + std::vector feature_values; + /// The fields containing the measurements to use for this row + /// + /// These will be used to populate the measurement columns. If nullopt then nulls + /// will be inserted for the given value. + std::vector> measurement_values; +}; + +/// \brief Reshape a table by turning some columns into additional rows +/// +/// This operation is sometimes also referred to as UNPIVOT +/// +/// This is typically done when there are multiple observations in each row in order to +/// transform to a table containing a single observation per row. +/// +/// For example: +/// +/// | time | left_temp | right_temp | +/// | ---- | --------- | ---------- | +/// | 1 | 10 | 20 | +/// | 2 | 15 | 18 | +/// +/// The above table contains two observations per row. There is an implicit feature +/// "location" (left vs right) and a measurement "temp". What we really want is: +/// +/// | time | location | temp | +/// | --- | --- | --- | +/// | 1 | left | 10 | +/// | 1 | right | 20 | +/// | 2 | left | 15 | +/// | 2 | right | 18 | +/// +/// For a more complex example consider: +/// +/// | time | ax1 | ay1 | bx1 | ay2 | +/// | ---- | --- | --- | --- | --- | +/// | 0 | 1 | 2 | 3 | 4 | +/// +/// We can pretend a vs b and x vs y are features while 1 and 2 are two different +/// kinds of measurements. We thus want to pivot to +/// +/// | time | a/b | x/y | f1 | f2 | +/// | ---- | --- | --- | ---- | ---- | +/// | 0 | a | x | 1 | null | +/// | 0 | a | y | 2 | 4 | +/// | 0 | b | x | 3 | null | +/// +/// To do this we create a row template for each combination of features. One should +/// be able to do this purely by looking at the column names. For example, given the +/// above columns "ax1", "ay1", "bx1", and "ay2" we know we have three feature +/// combinations (a, x), (a, y), and (b, x). Similarly, we know we have two possible +/// measurements, "1" and "2". +/// +/// For each combination of features we create a row template. In each row template we +/// describe the combination and then list which columns to use for the measurements. +/// If a measurement doesn't exist for a given combination then we use nullopt. 
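A sketch of configuring the unpivot described above for the two-column temperature example, one row template per location (Declaration helpers again assumed from arrow/acero/exec_plan.h):

#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "arrow/acero/exec_plan.h"  // Declaration, DeclarationToTable (assumed available)
#include "arrow/acero/options.h"
#include "arrow/type.h"

namespace acero = arrow::acero;

arrow::Result<std::shared_ptr<arrow::Table>> UnpivotTemps(std::shared_ptr<arrow::Table> table) {
  acero::PivotLongerNodeOptions pivot;
  pivot.feature_field_names = {"location"};
  pivot.measurement_field_names = {"temp"};
  // One row template per feature combination: "left" reads left_temp, "right" reads right_temp.
  pivot.row_templates.emplace_back(
      std::vector<std::string>{"left"},
      std::vector<std::optional<arrow::FieldRef>>{arrow::FieldRef("left_temp")});
  pivot.row_templates.emplace_back(
      std::vector<std::string>{"right"},
      std::vector<std::optional<arrow::FieldRef>>{arrow::FieldRef("right_temp")});

  acero::Declaration plan = acero::Declaration::Sequence({
      {"table_source", acero::TableSourceNodeOptions{std::move(table)}},
      {"pivot_longer", std::move(pivot)},
  });
  return acero::DeclarationToTable(std::move(plan));
}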
+/// +/// So, for our above example, we have: +/// +/// (a, x): names={"a", "x"}, values={"ax1", nullopt} +/// (a, y): names={"a", "y"}, values={"ay1", "ay2"} +/// (b, x): names={"b", "x"}, values={"bx1", nullopt} +/// +/// Finishing it off we name our new columns: +/// feature_field_names={"a/b","x/y"} +/// measurement_field_names={"f1", "f2"} +class ARROW_ACERO_EXPORT PivotLongerNodeOptions : public ExecNodeOptions { + public: + static constexpr std::string_view kName = "pivot_longer"; + /// One or more row templates to create new output rows + /// + /// Normally there are at least two row templates. The output # of rows + /// will be the input # of rows * the number of row templates + std::vector row_templates; + /// The names of the columns which describe the new features + std::vector feature_field_names; + /// The names of the columns which represent the measurements + std::vector measurement_field_names; +}; + +/// @} + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/order_by_impl.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/order_by_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..9b5a0f69a69ffc8f23fb5416e82777d2d06f0a00 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/order_by_impl.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/acero/options.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" + +namespace arrow { + +using compute::ExecContext; + +namespace acero { + +class OrderByImpl { + public: + virtual ~OrderByImpl() = default; + + virtual void InputReceived(const std::shared_ptr& batch) = 0; + + virtual Result DoFinish() = 0; + + virtual std::string ToString() const = 0; + + static Result> MakeSort( + ExecContext* ctx, const std::shared_ptr& output_schema, + const SortOptions& options); + + static Result> MakeSelectK( + ExecContext* ctx, const std::shared_ptr& output_schema, + const SelectKOptions& options); +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/partition_util.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/partition_util.h new file mode 100644 index 0000000000000000000000000000000000000000..1413a8326ade01fc264c4800d83d2df85db59acd --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/partition_util.h @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "arrow/acero/util.h" +#include "arrow/buffer.h" +#include "arrow/util/pcg_random.h" + +namespace arrow { +namespace acero { + +class PartitionSort { + public: + /// \brief Bucket sort rows on partition ids in O(num_rows) time. + /// + /// Include in the output exclusive cumulative sum of bucket sizes. + /// This corresponds to ranges in the sorted array containing all row ids for + /// each of the partitions. + /// + /// prtn_ranges must be initialized and have at least prtn_ranges + 1 elements + /// when this method returns prtn_ranges[i] will contains the total number of + /// elements in partitions 0 through i. prtn_ranges[0] will be 0. + /// + /// prtn_id_impl must be a function that takes in a row id (int) and returns + /// a partition id (int). The returned partition id must be between 0 and + /// num_prtns (exclusive). + /// + /// output_pos_impl is a function that takes in a row id (int) and a position (int) + /// in the bucket sorted output. The function should insert the row in the + /// output. 
+ /// + /// For example: + /// + /// in_arr: [5, 7, 2, 3, 5, 4] + /// num_prtns: 3 + /// prtn_id_impl: [&in_arr] (int row_id) { return in_arr[row_id] / 3; } + /// output_pos_impl: [&out_arr] (int row_id, int pos) { out_arr[pos] = row_id; } + /// + /// After Execution + /// out_arr: [2, 5, 3, 5, 4, 7] + /// prtn_ranges: [0, 1, 5, 6] + template + static void Eval(int64_t num_rows, int num_prtns, uint16_t* prtn_ranges, + INPUT_PRTN_ID_FN prtn_id_impl, OUTPUT_POS_FN output_pos_impl) { + ARROW_DCHECK(num_rows > 0 && num_rows <= (1 << 15)); + ARROW_DCHECK(num_prtns >= 1 && num_prtns <= (1 << 15)); + + memset(prtn_ranges, 0, (num_prtns + 1) * sizeof(uint16_t)); + + for (int64_t i = 0; i < num_rows; ++i) { + int prtn_id = static_cast(prtn_id_impl(i)); + ++prtn_ranges[prtn_id + 1]; + } + + uint16_t sum = 0; + for (int i = 0; i < num_prtns; ++i) { + uint16_t sum_next = sum + prtn_ranges[i + 1]; + prtn_ranges[i + 1] = sum; + sum = sum_next; + } + + for (int64_t i = 0; i < num_rows; ++i) { + int prtn_id = static_cast(prtn_id_impl(i)); + int pos = prtn_ranges[prtn_id + 1]++; + output_pos_impl(i, pos); + } + } +}; + +/// \brief A control for synchronizing threads on a partitionable workload +class PartitionLocks { + public: + PartitionLocks(); + ~PartitionLocks(); + /// \brief Initializes the control, must be called before use + /// + /// \param num_threads Maximum number of threads that will access the partitions + /// \param num_prtns Number of partitions to synchronize + void Init(size_t num_threads, int num_prtns); + /// \brief Cleans up the control, it should not be used after this call + void CleanUp(); + /// \brief Acquire a partition to work on one + /// + /// \param thread_id The index of the thread trying to acquire the partition lock + /// \param num_prtns Length of prtns_to_try, must be <= num_prtns used in Init + /// \param prtns_to_try An array of partitions that still have remaining work + /// \param limit_retries If false, this method will spinwait forever until success + /// \param max_retries Max times to attempt checking out work before returning false + /// \param[out] locked_prtn_id The id of the partition locked + /// \param[out] locked_prtn_id_pos The index of the partition locked in prtns_to_try + /// \return True if a partition was locked, false if max_retries was attempted + /// without successfully acquiring a lock + /// + /// This method is thread safe + bool AcquirePartitionLock(size_t thread_id, int num_prtns, const int* prtns_to_try, + bool limit_retries, int max_retries, int* locked_prtn_id, + int* locked_prtn_id_pos); + /// \brief Release a partition so that other threads can work on it + void ReleasePartitionLock(int prtn_id); + + // Executes (synchronously and using current thread) the same operation on a set of + // multiple partitions. Tries to minimize partition locking overhead by randomizing and + // adjusting order in which partitions are processed. + // + // PROCESS_PRTN_FN is a callback which will be executed for each partition after + // acquiring the lock for that partition. It gets partition id as an argument. + // IS_PRTN_EMPTY_FN is a callback which filters out (when returning true) partitions + // with specific ids from processing. 
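A small runnable version of the worked example above (note that to reproduce the documented out_arr of [2, 5, 3, 5, 4, 7] the output callback has to store the value in_arr[row_id] rather than the raw row id; the documented prtn_ranges of [0, 1, 5, 6] follows either way):

#include <cstdint>
#include <vector>

#include "arrow/acero/partition_util.h"  // the header shown above

void BucketSortExample() {
  std::vector<int> in_arr = {5, 7, 2, 3, 5, 4};
  const int num_prtns = 3;
  std::vector<uint16_t> prtn_ranges(num_prtns + 1);
  std::vector<int> out_arr(in_arr.size());

  arrow::acero::PartitionSort::Eval(
      /*num_rows=*/static_cast<int64_t>(in_arr.size()), num_prtns, prtn_ranges.data(),
      /*prtn_id_impl=*/[&](int64_t row_id) { return in_arr[row_id] / 3; },
      /*output_pos_impl=*/[&](int64_t row_id, int pos) { out_arr[pos] = in_arr[row_id]; });

  // out_arr is now [2, 5, 3, 5, 4, 7] (values grouped by partition ids 0, 1, 2) and
  // prtn_ranges is [0, 1, 5, 6]: partition i occupies [prtn_ranges[i], prtn_ranges[i + 1]).
}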
+ // + template + Status ForEachPartition(size_t thread_id, + /*scratch space buffer with space for one element per partition; + dirty in and dirty out*/ + int* temp_unprocessed_prtns, IS_PRTN_EMPTY_FN is_prtn_empty_fn, + PROCESS_PRTN_FN process_prtn_fn) { + int num_unprocessed_partitions = 0; + for (int i = 0; i < num_prtns_; ++i) { + bool is_prtn_empty = is_prtn_empty_fn(i); + if (!is_prtn_empty) { + temp_unprocessed_prtns[num_unprocessed_partitions++] = i; + } + } + while (num_unprocessed_partitions > 0) { + int locked_prtn_id; + int locked_prtn_id_pos; + AcquirePartitionLock(thread_id, num_unprocessed_partitions, temp_unprocessed_prtns, + /*limit_retries=*/false, /*max_retries=*/-1, &locked_prtn_id, + &locked_prtn_id_pos); + { + class AutoReleaseLock { + public: + AutoReleaseLock(PartitionLocks* locks, int prtn_id) + : locks(locks), prtn_id(prtn_id) {} + ~AutoReleaseLock() { locks->ReleasePartitionLock(prtn_id); } + PartitionLocks* locks; + int prtn_id; + } auto_release_lock(this, locked_prtn_id); + ARROW_RETURN_NOT_OK(process_prtn_fn(locked_prtn_id)); + } + if (locked_prtn_id_pos < num_unprocessed_partitions - 1) { + temp_unprocessed_prtns[locked_prtn_id_pos] = + temp_unprocessed_prtns[num_unprocessed_partitions - 1]; + } + --num_unprocessed_partitions; + } + return Status::OK(); + } + + private: + std::atomic* lock_ptr(int prtn_id); + int random_int(size_t thread_id, int num_values); + + struct PartitionLock { + static constexpr int kCacheLineBytes = 64; + std::atomic lock; + uint8_t padding[kCacheLineBytes]; + }; + int num_prtns_; + std::unique_ptr locks_; + std::unique_ptr rngs_; +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/pch.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/pch.h new file mode 100644 index 0000000000000000000000000000000000000000..ddb4c120f2a877ffb794b8443f8af1f7707d2cf6 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/pch.h @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Often-used headers, for precompiling. +// If updating this header, please make sure you check compilation speed +// before checking in. Adding headers which are not used extremely often +// may incur a slowdown, since it makes the precompiled header heavier to load. 
+ +#include "arrow/pch.h" diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/query_context.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/query_context.h new file mode 100644 index 0000000000000000000000000000000000000000..3eff299439828e602558e5ebc278660bb7ce37eb --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/query_context.h @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include + +#include "arrow/acero/exec_plan.h" +#include "arrow/acero/task_util.h" +#include "arrow/acero/util.h" +#include "arrow/compute/exec.h" +#include "arrow/io/interfaces.h" +#include "arrow/util/async_util.h" +#include "arrow/util/type_fwd.h" + +namespace arrow { + +using compute::default_exec_context; +using io::IOContext; + +namespace acero { + +class ARROW_ACERO_EXPORT QueryContext { + public: + QueryContext(QueryOptions opts = {}, + ExecContext exec_context = *default_exec_context()); + + Status Init(arrow::util::AsyncTaskScheduler* scheduler); + + const ::arrow::internal::CpuInfo* cpu_info() const; + int64_t hardware_flags() const; + const QueryOptions& options() const { return options_; } + MemoryPool* memory_pool() const { return exec_context_.memory_pool(); } + ::arrow::internal::Executor* executor() const { return exec_context_.executor(); } + ExecContext* exec_context() { return &exec_context_; } + IOContext* io_context() { return &io_context_; } + TaskScheduler* scheduler() { return task_scheduler_.get(); } + arrow::util::AsyncTaskScheduler* async_scheduler() { return async_scheduler_; } + + size_t GetThreadIndex(); + size_t max_concurrency() const; + + /// \brief Start an external task + /// + /// This should be avoided if possible. It is kept in for now for legacy + /// purposes. This should be called before the external task is started. If + /// a valid future is returned then it should be marked complete when the + /// external task has finished. + /// + /// \param name A name to give the task for traceability and debugging + /// + /// \return an invalid future if the plan has already ended, otherwise this + /// returns a future that must be completed when the external task + /// finishes. + Result> BeginExternalTask(std::string_view name); + + /// \brief Add a single function as a task to the query's task group + /// on the compute threadpool. + /// + /// \param fn The task to run. Takes no arguments and returns a Status. + /// \param name A name to give the task for traceability and debugging + void ScheduleTask(std::function fn, std::string_view name); + /// \brief Add a single function as a task to the query's task group + /// on the compute threadpool. + /// + /// \param fn The task to run. 
Takes the thread index and returns a Status. + /// \param name A name to give the task for traceability and debugging + void ScheduleTask(std::function fn, std::string_view name); + /// \brief Add a single function as a task to the query's task group on + /// the IO thread pool + /// + /// \param fn The task to run. Returns a status. + /// \param name A name to give the task for traceability and debugging + void ScheduleIOTask(std::function fn, std::string_view name); + + // Register/Start TaskGroup is a way of performing a "Parallel For" pattern: + // - The task function takes the thread index and the index of the task + // - The on_finished function takes the thread index + // Returns an integer ID that will be used to reference the task group in + // StartTaskGroup. At runtime, call StartTaskGroup with the ID and the number of times + // you'd like the task to be executed. The need to register a task group before use will + // be removed after we rewrite the scheduler. + /// \brief Register a "parallel for" task group with the scheduler + /// + /// \param task The function implementing the task. Takes the thread_index and + /// the task index. + /// \param on_finished The function that gets run once all tasks have been completed. + /// Takes the thread_index. + /// + /// Must be called inside of ExecNode::Init. + int RegisterTaskGroup(std::function task, + std::function on_finished); + + /// \brief Start the task group with the specified ID. This can only + /// be called once per task_group_id. + /// + /// \param task_group_id The ID of the task group to run + /// \param num_tasks The number of times to run the task + Status StartTaskGroup(int task_group_id, int64_t num_tasks); + + // This is an RAII class for keeping track of in-flight file IO. Useful for getting + // an estimate of memory use, and how much memory we expect to be freed soon. + // Returned by ReportTempFileIO. + struct [[nodiscard]] TempFileIOMark { + QueryContext* ctx_; + size_t bytes_; + + TempFileIOMark(QueryContext* ctx, size_t bytes) : ctx_(ctx), bytes_(bytes) { + ctx_->in_flight_bytes_to_disk_.fetch_add(bytes_, std::memory_order_acquire); + } + + ARROW_DISALLOW_COPY_AND_ASSIGN(TempFileIOMark); + + ~TempFileIOMark() { + ctx_->in_flight_bytes_to_disk_.fetch_sub(bytes_, std::memory_order_release); + } + }; + + TempFileIOMark ReportTempFileIO(size_t bytes) { return {this, bytes}; } + + size_t GetCurrentTempFileIO() { return in_flight_bytes_to_disk_.load(); } + + private: + QueryOptions options_; + // To be replaced with Acero-specific context once scheduler is done and + // we don't need ExecContext for kernels + ExecContext exec_context_; + IOContext io_context_; + + arrow::util::AsyncTaskScheduler* async_scheduler_ = NULLPTR; + std::unique_ptr task_scheduler_ = TaskScheduler::Make(); + + ThreadIndexer thread_indexer_; + + std::atomic in_flight_bytes_to_disk_{0}; +}; +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/schema_util.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/schema_util.h new file mode 100644 index 0000000000000000000000000000000000000000..db3076a58841a6cb85fcc3d5033ef3b74ed18898 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/schema_util.h @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/type.h" // for DataType, FieldRef, Field and Schema + +namespace arrow { + +using internal::checked_cast; + +namespace acero { + +// Identifiers for all different row schemas that are used in a join +// +enum class HashJoinProjection : int { + INPUT = 0, + KEY = 1, + PAYLOAD = 2, + FILTER = 3, + OUTPUT = 4 +}; + +struct SchemaProjectionMap { + static constexpr int kMissingField = -1; + int num_cols; + const int* source_to_base; + const int* base_to_target; + inline int get(int i) const { + assert(i >= 0 && i < num_cols); + assert(source_to_base[i] != kMissingField); + return base_to_target[source_to_base[i]]; + } +}; + +/// Helper class for managing different projections of the same row schema. +/// Used to efficiently map any field in one projection to a corresponding field in +/// another projection. +/// Materialized mappings are generated lazily at the time of the first access. +/// Thread-safe apart from initialization. +template +class SchemaProjectionMaps { + public: + static constexpr int kMissingField = -1; + + Status Init(ProjectionIdEnum full_schema_handle, const Schema& schema, + const std::vector& projection_handles, + const std::vector*>& projections) { + assert(projection_handles.size() == projections.size()); + ARROW_RETURN_NOT_OK(RegisterSchema(full_schema_handle, schema)); + for (size_t i = 0; i < projections.size(); ++i) { + ARROW_RETURN_NOT_OK( + RegisterProjectedSchema(projection_handles[i], *(projections[i]), schema)); + } + RegisterEnd(); + return Status::OK(); + } + + int num_cols(ProjectionIdEnum schema_handle) const { + int id = schema_id(schema_handle); + return static_cast(schemas_[id].second.data_types.size()); + } + + bool is_empty(ProjectionIdEnum schema_handle) const { + return num_cols(schema_handle) == 0; + } + + const std::string& field_name(ProjectionIdEnum schema_handle, int field_id) const { + int id = schema_id(schema_handle); + return schemas_[id].second.field_names[field_id]; + } + + const std::shared_ptr& data_type(ProjectionIdEnum schema_handle, + int field_id) const { + int id = schema_id(schema_handle); + return schemas_[id].second.data_types[field_id]; + } + + const std::vector>& data_types( + ProjectionIdEnum schema_handle) const { + int id = schema_id(schema_handle); + return schemas_[id].second.data_types; + } + + SchemaProjectionMap map(ProjectionIdEnum from, ProjectionIdEnum to) const { + int id_from = schema_id(from); + int id_to = schema_id(to); + SchemaProjectionMap result; + result.num_cols = num_cols(from); + result.source_to_base = mappings_[id_from].data(); + result.base_to_target = inverse_mappings_[id_to].data(); + return result; + } + + protected: + struct FieldInfos { + std::vector field_paths; + std::vector field_names; + std::vector> data_types; + }; 
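A minimal usage sketch (not from the vendored header): the lookup pattern that map() above is meant to support, shown against the HashJoinProjectionMaps alias defined at the end of this file. The proj_maps instance is assumed to have been populated via Init(); variable names are illustrative.

// Map key column 0 of a join input to its position in the output projection.
arrow::acero::SchemaProjectionMap key_to_output = proj_maps.map(
    arrow::acero::HashJoinProjection::KEY, arrow::acero::HashJoinProjection::OUTPUT);
int output_col = key_to_output.get(0);  // asserts that the mapping exists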
+ + Status RegisterSchema(ProjectionIdEnum handle, const Schema& schema) { + FieldInfos out_fields; + const FieldVector& in_fields = schema.fields(); + out_fields.field_paths.resize(in_fields.size()); + out_fields.field_names.resize(in_fields.size()); + out_fields.data_types.resize(in_fields.size()); + for (size_t i = 0; i < in_fields.size(); ++i) { + const std::string& name = in_fields[i]->name(); + const std::shared_ptr& type = in_fields[i]->type(); + out_fields.field_paths[i] = static_cast(i); + out_fields.field_names[i] = name; + out_fields.data_types[i] = type; + } + schemas_.push_back(std::make_pair(handle, out_fields)); + return Status::OK(); + } + + Status RegisterProjectedSchema(ProjectionIdEnum handle, + const std::vector& selected_fields, + const Schema& full_schema) { + FieldInfos out_fields; + const FieldVector& in_fields = full_schema.fields(); + out_fields.field_paths.resize(selected_fields.size()); + out_fields.field_names.resize(selected_fields.size()); + out_fields.data_types.resize(selected_fields.size()); + for (size_t i = 0; i < selected_fields.size(); ++i) { + // All fields must be found in schema without ambiguity + ARROW_ASSIGN_OR_RAISE(auto match, selected_fields[i].FindOne(full_schema)); + const std::string& name = in_fields[match[0]]->name(); + const std::shared_ptr& type = in_fields[match[0]]->type(); + out_fields.field_paths[i] = match[0]; + out_fields.field_names[i] = name; + out_fields.data_types[i] = type; + } + schemas_.push_back(std::make_pair(handle, out_fields)); + return Status::OK(); + } + + void RegisterEnd() { + size_t size = schemas_.size(); + mappings_.resize(size); + inverse_mappings_.resize(size); + int id_base = 0; + for (size_t i = 0; i < size; ++i) { + GenerateMapForProjection(static_cast(i), id_base); + } + } + + int schema_id(ProjectionIdEnum schema_handle) const { + for (size_t i = 0; i < schemas_.size(); ++i) { + if (schemas_[i].first == schema_handle) { + return static_cast(i); + } + } + // We should never get here + assert(false); + return -1; + } + + void GenerateMapForProjection(int id_proj, int id_base) { + int num_cols_proj = static_cast(schemas_[id_proj].second.data_types.size()); + int num_cols_base = static_cast(schemas_[id_base].second.data_types.size()); + + std::vector& mapping = mappings_[id_proj]; + std::vector& inverse_mapping = inverse_mappings_[id_proj]; + mapping.resize(num_cols_proj); + inverse_mapping.resize(num_cols_base); + + if (id_proj == id_base) { + for (int i = 0; i < num_cols_base; ++i) { + mapping[i] = inverse_mapping[i] = i; + } + } else { + const FieldInfos& fields_proj = schemas_[id_proj].second; + const FieldInfos& fields_base = schemas_[id_base].second; + for (int i = 0; i < num_cols_base; ++i) { + inverse_mapping[i] = SchemaProjectionMap::kMissingField; + } + for (int i = 0; i < num_cols_proj; ++i) { + int field_id = SchemaProjectionMap::kMissingField; + for (int j = 0; j < num_cols_base; ++j) { + if (fields_proj.field_paths[i] == fields_base.field_paths[j]) { + field_id = j; + // If there are multiple matches for the same input field, + // it will be mapped to the first match. 
+ break; + } + } + assert(field_id != SchemaProjectionMap::kMissingField); + mapping[i] = field_id; + inverse_mapping[field_id] = i; + } + } + } + + // vector used as a mapping from ProjectionIdEnum to fields + std::vector> schemas_; + std::vector> mappings_; + std::vector> inverse_mappings_; +}; + +using HashJoinProjectionMaps = SchemaProjectionMaps; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/task_util.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/task_util.h new file mode 100644 index 0000000000000000000000000000000000000000..fbd4af699d12795bd92bd385f23a036d63adde38 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/task_util.h @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/acero/visibility.h" +#include "arrow/status.h" +#include "arrow/util/config.h" +#include "arrow/util/logging.h" + +namespace arrow { +namespace acero { + +// Atomic value surrounded by padding bytes to avoid cache line invalidation +// whenever it is modified by a concurrent thread on a different CPU core. +// +template +class AtomicWithPadding { + private: + static constexpr int kCacheLineSize = 64; + uint8_t padding_before[kCacheLineSize]; + + public: + std::atomic value; + + private: + uint8_t padding_after[kCacheLineSize]; +}; + +// Used for asynchronous execution of operations that can be broken into +// a fixed number of symmetric tasks that can be executed concurrently. +// +// Implements priorities between multiple such operations, called task groups. +// +// Allows to specify the maximum number of in-flight tasks at any moment. +// +// Also allows for executing next pending tasks immediately using a caller thread. +// +class ARROW_ACERO_EXPORT TaskScheduler { + public: + using TaskImpl = std::function; + using TaskGroupContinuationImpl = std::function; + using ScheduleImpl = std::function; + using AbortContinuationImpl = std::function; + + virtual ~TaskScheduler() = default; + + // Order in which task groups are registered represents priorities of their tasks + // (the first group has the highest priority). + // + // Returns task group identifier that is used to request operations on the task group. 
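A minimal usage sketch (not from the vendored header) of the register-then-start pattern described above, using the TaskScheduler methods declared just below. The lambda signatures follow the upstream TaskImpl, TaskGroupContinuationImpl, and ScheduleImpl aliases, whose template arguments were dropped in this copy of the header, so treat them as assumptions.

arrow::Status RunTaskGroup() {
  auto scheduler = arrow::acero::TaskScheduler::Make();
  int group_id = scheduler->RegisterTaskGroup(
      /*task_impl=*/[](size_t thread_id, int64_t task_id) { return arrow::Status::OK(); },
      /*cont_impl=*/[](size_t thread_id) { return arrow::Status::OK(); });
  scheduler->RegisterEnd();
  ARROW_RETURN_NOT_OK(scheduler->StartScheduling(
      /*thread_id=*/0,
      /*schedule_impl=*/[](auto run_task) { return run_task(/*thread_id=*/0); },
      /*num_concurrent_tasks=*/1, /*use_sync_execution=*/true));
  // Run the registered task 100 times; the continuation fires after the last one.
  return scheduler->StartTaskGroup(/*thread_id=*/0, group_id, /*total_num_tasks=*/100);
}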
+ virtual int RegisterTaskGroup(TaskImpl task_impl, + TaskGroupContinuationImpl cont_impl) = 0; + + virtual void RegisterEnd() = 0; + + // total_num_tasks may be zero, in which case task group continuation will be executed + // immediately + virtual Status StartTaskGroup(size_t thread_id, int group_id, + int64_t total_num_tasks) = 0; + + // Execute given number of tasks immediately using caller thread + virtual Status ExecuteMore(size_t thread_id, int num_tasks_to_execute, + bool execute_all) = 0; + + // Begin scheduling tasks using provided callback and + // the limit on the number of in-flight tasks at any moment. + // + // Scheduling will continue as long as there are waiting tasks. + // + // It will automatically resume whenever new task group gets started. + virtual Status StartScheduling(size_t thread_id, ScheduleImpl schedule_impl, + int num_concurrent_tasks, bool use_sync_execution) = 0; + + // Abort scheduling and execution. + // Used in case of being notified about unrecoverable error for the entire query. + virtual void Abort(AbortContinuationImpl impl) = 0; + + static std::unique_ptr Make(); +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/test_nodes.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/test_nodes.h new file mode 100644 index 0000000000000000000000000000000000000000..7e31aa31b34d7b423ab85ff2e77c1cec0087fa5b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/test_nodes.h @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/acero/options.h" +#include "arrow/acero/test_util_internal.h" +#include "arrow/testing/random.h" + +namespace arrow { +namespace acero { + +// \brief Make a delaying source that is optionally noisy (prints when it emits) +AsyncGenerator> MakeDelayedGen( + Iterator> src, std::string label, double delay_sec, + bool noisy = false); + +// \brief Make a delaying source that is optionally noisy (prints when it emits) +AsyncGenerator> MakeDelayedGen( + AsyncGenerator> src, std::string label, double delay_sec, + bool noisy = false); + +// \brief Make a delaying source that is optionally noisy (prints when it emits) +AsyncGenerator> MakeDelayedGen(BatchesWithSchema src, + std::string label, + double delay_sec, + bool noisy = false); + +/// A node that slightly resequences the input at random +struct JitterNodeOptions : public ExecNodeOptions { + random::SeedType seed; + /// The max amount to add to a node's "cost". 
+ int max_jitter_modifier; + + explicit JitterNodeOptions(random::SeedType seed, int max_jitter_modifier = 5) + : seed(seed), max_jitter_modifier(max_jitter_modifier) {} + static constexpr std::string_view kName = "jitter"; +}; + +class GateImpl; + +class Gate { + public: + static std::shared_ptr Make(); + + Gate(); + virtual ~Gate(); + + void ReleaseAllBatches(); + void ReleaseOneBatch(); + Future<> WaitForNextReleasedBatch(); + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Gate); + + GateImpl* impl_; +}; + +// A node that holds all input batches until a given gate is released +struct GatedNodeOptions : public ExecNodeOptions { + explicit GatedNodeOptions(Gate* gate) : gate(gate) {} + Gate* gate; + + static constexpr std::string_view kName = "gated"; +}; + +void RegisterTestNodes(); + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/time_series_util.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/time_series_util.h new file mode 100644 index 0000000000000000000000000000000000000000..97707f43bf20b95387f463a9c07e37f54c33998c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/time_series_util.h @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/record_batch.h" +#include "arrow/type_traits.h" + +namespace arrow::acero { + +// normalize the value to unsigned 64-bits while preserving ordering of values +template ::value, bool> = true> +uint64_t NormalizeTime(T t); + +uint64_t GetTime(const RecordBatch* batch, Type::type time_type, int col, uint64_t row); + +} // namespace arrow::acero diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/tpch_node.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/tpch_node.h new file mode 100644 index 0000000000000000000000000000000000000000..e6476b57ad6b4108af56777c029d932f4af94726 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/tpch_node.h @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/acero/type_fwd.h" +#include "arrow/acero/visibility.h" +#include "arrow/result.h" +#include "arrow/status.h" + +namespace arrow { +namespace acero { +namespace internal { + +class ARROW_ACERO_EXPORT TpchGen { + public: + virtual ~TpchGen() = default; + + /* + * \brief Create a factory for nodes that generate TPC-H data + * + * Note: Individual tables will reference each other. It is important that you only + * create a single TpchGen instance for each plan and then you can create nodes for each + * table from that single TpchGen instance. Note: Every batch will be scheduled as a new + * task using the ExecPlan's scheduler. + */ + static Result> Make( + ExecPlan* plan, double scale_factor = 1.0, int64_t batch_size = 4096, + std::optional seed = std::nullopt); + + // The below methods will create and add an ExecNode to the plan that generates + // data for the desired table. If columns is empty, all columns will be generated. + // The methods return the added ExecNode, which should be used for inputs. + virtual Result Supplier(std::vector columns = {}) = 0; + virtual Result Part(std::vector columns = {}) = 0; + virtual Result PartSupp(std::vector columns = {}) = 0; + virtual Result Customer(std::vector columns = {}) = 0; + virtual Result Orders(std::vector columns = {}) = 0; + virtual Result Lineitem(std::vector columns = {}) = 0; + virtual Result Nation(std::vector columns = {}) = 0; + virtual Result Region(std::vector columns = {}) = 0; +}; + +} // namespace internal +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/type_fwd.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..f0410de9f7830a7d0e55a04eb514ae9d82e6958c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/type_fwd.h @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
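A minimal usage sketch (not from the vendored header) of the single-TpchGen-per-plan usage that the comment above prescribes. The plan pointer is assumed to exist already and the column selections are only illustrative.

arrow::Status AddTpchSources(arrow::acero::ExecPlan* plan) {
  using arrow::acero::internal::TpchGen;
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<TpchGen> gen,
                        TpchGen::Make(plan, /*scale_factor=*/0.1));
  // Both tables come from the same generator so they reference each other consistently.
  ARROW_ASSIGN_OR_RAISE(arrow::acero::ExecNode* lineitem, gen->Lineitem());
  ARROW_ASSIGN_OR_RAISE(arrow::acero::ExecNode* orders,
                        gen->Orders({"o_orderkey", "o_orderdate"}));
  (void)lineitem;
  (void)orders;  // in real code these become inputs to downstream nodes
  return arrow::Status::OK();
}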
+ +#pragma once + +#include "arrow/compute/type_fwd.h" + +namespace arrow { + +namespace acero { + +class ExecNode; +class ExecPlan; +class ExecNodeOptions; +class ExecFactoryRegistry; +class QueryContext; +struct QueryOptions; +struct Declaration; +class SinkNodeConsumer; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/util.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/util.h new file mode 100644 index 0000000000000000000000000000000000000000..ee46e8527422abae4f97804058639593dd6b159c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/util.h @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/acero/options.h" +#include "arrow/acero/type_fwd.h" +#include "arrow/buffer.h" +#include "arrow/compute/expression.h" +#include "arrow/compute/util.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/cpu_info.h" +#include "arrow/util/logging.h" +#include "arrow/util/mutex.h" +#include "arrow/util/thread_pool.h" +#include "arrow/util/type_fwd.h" + +namespace arrow { + +namespace acero { + +ARROW_ACERO_EXPORT +Status ValidateExecNodeInputs(ExecPlan* plan, const std::vector& inputs, + int expected_num_inputs, const char* kind_name); + +ARROW_ACERO_EXPORT +Result> TableFromExecBatches( + const std::shared_ptr& schema, const std::vector& exec_batches); + +class ARROW_ACERO_EXPORT AtomicCounter { + public: + AtomicCounter() = default; + + int count() const { return count_.load(); } + + std::optional total() const { + int total = total_.load(); + if (total == -1) return {}; + return total; + } + + // return true if the counter is complete + bool Increment() { + ARROW_DCHECK_NE(count_.load(), total_.load()); + int count = count_.fetch_add(1) + 1; + if (count != total_.load()) return false; + return DoneOnce(); + } + + // return true if the counter is complete + bool SetTotal(int total) { + total_.store(total); + if (count_.load() != total) return false; + return DoneOnce(); + } + + // return true if the counter has not already been completed + bool Cancel() { return DoneOnce(); } + + // return true if the counter has finished or been cancelled + bool Completed() { return complete_.load(); } + + private: + // ensure there is only one true return from Increment(), SetTotal(), or Cancel() + bool DoneOnce() { + bool expected = false; + return complete_.compare_exchange_strong(expected, true); + } + + std::atomic count_{0}, total_{-1}; + std::atomic complete_{false}; +}; + +class ARROW_ACERO_EXPORT ThreadIndexer 
{ + public: + size_t operator()(); + + static size_t Capacity(); + + private: + static size_t Check(size_t thread_index); + + arrow::util::Mutex mutex_; + std::unordered_map<std::thread::id, size_t> id_to_index_; +}; + +/// \brief A consumer that collects results into an in-memory table +struct ARROW_ACERO_EXPORT TableSinkNodeConsumer : public SinkNodeConsumer { + public: + TableSinkNodeConsumer(std::shared_ptr<Table>* out, MemoryPool* pool)
+ : out_(out), pool_(pool) {} + Status Init(const std::shared_ptr<Schema>& schema, + BackpressureControl* backpressure_control, ExecPlan* plan) override; + Status Consume(ExecBatch batch) override; + Future<> Finish() override; + + private: + std::shared_ptr<Table>
* out_; + MemoryPool* pool_; + std::shared_ptr schema_; + std::vector> batches_; + arrow::util::Mutex consume_mutex_; +}; + +class ARROW_ACERO_EXPORT NullSinkNodeConsumer : public SinkNodeConsumer { + public: + Status Init(const std::shared_ptr&, BackpressureControl*, + ExecPlan* plan) override { + return Status::OK(); + } + Status Consume(ExecBatch exec_batch) override { return Status::OK(); } + Future<> Finish() override { return Status::OK(); } + + public: + static std::shared_ptr Make() { + return std::make_shared(); + } +}; + +/// CRTP helper for tracing helper functions + +class ARROW_ACERO_EXPORT TracedNode { + public: + // All nodes should call TraceStartProducing or NoteStartProducing exactly once + // Most nodes will be fine with a call to NoteStartProducing since the StartProducing + // call is usually fairly cheap and simply schedules tasks to fetch the actual data. + + explicit TracedNode(ExecNode* node) : node_(node) {} + + // Create a span to record the StartProducing work + [[nodiscard]] ::arrow::internal::tracing::Scope TraceStartProducing( + std::string extra_details) const; + + // Record a call to StartProducing without creating with a span + void NoteStartProducing(std::string extra_details) const; + + // All nodes should call TraceInputReceived for each batch they receive. This call + // should track the time spent processing the batch. NoteInputReceived is available + // but usually won't be used unless a node is simply adding batches to a trivial queue. + + // Create a span to record the InputReceived work + [[nodiscard]] ::arrow::internal::tracing::Scope TraceInputReceived( + const ExecBatch& batch) const; + + // Record a call to InputReceived without creating with a span + void NoteInputReceived(const ExecBatch& batch) const; + + // Create a span to record any "finish" work. This should NOT be called as part of + // InputFinished and many nodes may not need to call this at all. This should be used + // when a node has some extra work that has to be done once it has received all of its + // data. For example, an aggregation node calculating aggregations. This will + // typically be called as a result of InputFinished OR InputReceived. + [[nodiscard]] ::arrow::internal::tracing::Scope TraceFinish() const; + + private: + ExecNode* node_; +}; + +} // namespace acero +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/visibility.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/visibility.h new file mode 100644 index 0000000000000000000000000000000000000000..21a697a56eca962602b34b2766d74442d185c3d7 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/acero/visibility.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#if defined(_WIN32) || defined(__CYGWIN__) +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4251) +# else +# pragma GCC diagnostic ignored "-Wattributes" +# endif + +# ifdef ARROW_ACERO_STATIC +# define ARROW_ACERO_EXPORT +# elif defined(ARROW_ACERO_EXPORTING) +# define ARROW_ACERO_EXPORT __declspec(dllexport) +# else +# define ARROW_ACERO_EXPORT __declspec(dllimport) +# endif + +# define ARROW_ACERO_NO_EXPORT +#else // Not Windows +# ifndef ARROW_ACERO_EXPORT +# define ARROW_ACERO_EXPORT __attribute__((visibility("default"))) +# endif +# ifndef ARROW_ACERO_NO_EXPORT +# define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden"))) +# endif +#endif // Not-Windows + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/orc/adapter.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/orc/adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..4ffff81f355f1ddcdc19516746c61b8021477de4 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/orc/adapter.h @@ -0,0 +1,323 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/adapters/orc/options.h" +#include "arrow/io/interfaces.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace adapters { +namespace orc { + +/// \brief Information about an ORC stripe +struct StripeInformation { + /// \brief Offset of the stripe from the start of the file, in bytes + int64_t offset; + /// \brief Length of the stripe, in bytes + int64_t length; + /// \brief Number of rows in the stripe + int64_t num_rows; + /// \brief Index of the first row of the stripe + int64_t first_row_id; +}; + +/// \class ORCFileReader +/// \brief Read an Arrow Table or RecordBatch from an ORC file. 
+class ARROW_EXPORT ORCFileReader { + public: + ~ORCFileReader(); + + /// \brief Creates a new ORC reader + /// + /// \param[in] file the data source + /// \param[in] pool a MemoryPool to use for buffer allocations + /// \return the returned reader object + static Result> Open( + const std::shared_ptr& file, MemoryPool* pool); + + /// \brief Return the schema read from the ORC file + /// + /// \return the returned Schema object + Result> ReadSchema(); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \return the returned Table + Result> Read(); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[in] schema the Table schema + /// \return the returned Table + Result> Read(const std::shared_ptr& schema); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[in] include_indices the selected field indices to read + /// \return the returned Table + Result> Read(const std::vector& include_indices); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[in] include_names the selected field names to read + /// \return the returned Table + Result> Read(const std::vector& include_names); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[in] schema the Table schema + /// \param[in] include_indices the selected field indices to read + /// \return the returned Table + Result> Read(const std::shared_ptr& schema, + const std::vector& include_indices); + + /// \brief Read a single stripe as a RecordBatch + /// + /// \param[in] stripe the stripe index + /// \return the returned RecordBatch + Result> ReadStripe(int64_t stripe); + + /// \brief Read a single stripe as a RecordBatch + /// + /// \param[in] stripe the stripe index + /// \param[in] include_indices the selected field indices to read + /// \return the returned RecordBatch + Result> ReadStripe( + int64_t stripe, const std::vector& include_indices); + + /// \brief Read a single stripe as a RecordBatch + /// + /// \param[in] stripe the stripe index + /// \param[in] include_names the selected field names to read + /// \return the returned RecordBatch + Result> ReadStripe( + int64_t stripe, const std::vector& include_names); + + /// \brief Seek to designated row. Invoke NextStripeReader() after seek + /// will return stripe reader starting from designated row. + /// + /// \param[in] row_number the rows number to seek + Status Seek(int64_t row_number); + + /// \brief Get a stripe level record batch iterator. + /// + /// Each record batch will have up to `batch_size` rows. + /// NextStripeReader serves as a fine-grained alternative to ReadStripe + /// which may cause OOM issues by loading the whole stripe into memory. + /// + /// Note this will only read rows for the current stripe, not the entire + /// file. + /// + /// \param[in] batch_size the maximum number of rows in each record batch + /// \return the returned stripe reader + Result> NextStripeReader(int64_t batch_size); + + /// \brief Get a stripe level record batch iterator. + /// + /// Each record batch will have up to `batch_size` rows. + /// NextStripeReader serves as a fine-grained alternative to ReadStripe + /// which may cause OOM issues by loading the whole stripe into memory. 
+ /// + /// Note this will only read rows for the current stripe, not the entire + /// file. + /// + /// \param[in] batch_size the maximum number of rows in each record batch + /// \param[in] include_indices the selected field indices to read + /// \return the stripe reader + Result> NextStripeReader( + int64_t batch_size, const std::vector& include_indices); + + /// \brief Get a record batch iterator for the entire file. + /// + /// Each record batch will have up to `batch_size` rows. + /// + /// \param[in] batch_size the maximum number of rows in each record batch + /// \param[in] include_names the selected field names to read, if not empty + /// (otherwise all fields are read) + /// \return the record batch iterator + Result> GetRecordBatchReader( + int64_t batch_size, const std::vector& include_names); + + /// \brief The number of stripes in the file + int64_t NumberOfStripes(); + + /// \brief The number of rows in the file + int64_t NumberOfRows(); + + /// \brief StripeInformation for each stripe. + StripeInformation GetStripeInformation(int64_t stripe); + + /// \brief Get the format version of the file. + /// Currently known values are 0.11 and 0.12. + /// + /// \return The FileVersion of the ORC file. + FileVersion GetFileVersion(); + + /// \brief Get the software instance and version that wrote this file. + /// + /// \return a user-facing string that specifies the software version + std::string GetSoftwareVersion(); + + /// \brief Get the compression kind of the file. + /// + /// \return The kind of compression in the ORC file. + Result GetCompression(); + + /// \brief Get the buffer size for the compression. + /// + /// \return Number of bytes to buffer for the compression codec. + int64_t GetCompressionSize(); + + /// \brief Get the number of rows per an entry in the row index. + /// \return the number of rows per an entry in the row index or 0 if there + /// is no row index. + int64_t GetRowIndexStride(); + + /// \brief Get ID of writer that generated the file. + /// + /// \return UNKNOWN_WRITER if the writer ID is undefined + WriterId GetWriterId(); + + /// \brief Get the writer id value when getWriterId() returns an unknown writer. + /// + /// \return the integer value of the writer ID. + int32_t GetWriterIdValue(); + + /// \brief Get the version of the writer. + /// + /// \return the version of the writer. + + WriterVersion GetWriterVersion(); + + /// \brief Get the number of stripe statistics in the file. + /// + /// \return the number of stripe statistics + int64_t GetNumberOfStripeStatistics(); + + /// \brief Get the length of the data stripes in the file. + /// + /// \return return the number of bytes in stripes + int64_t GetContentLength(); + + /// \brief Get the length of the file stripe statistics. + /// + /// \return the number of compressed bytes in the file stripe statistics + int64_t GetStripeStatisticsLength(); + + /// \brief Get the length of the file footer. + /// + /// \return the number of compressed bytes in the file footer + int64_t GetFileFooterLength(); + + /// \brief Get the length of the file postscript. + /// + /// \return the number of bytes in the file postscript + int64_t GetFilePostscriptLength(); + + /// \brief Get the total length of the file. + /// + /// \return the number of bytes in the file + int64_t GetFileLength(); + + /// \brief Get the serialized file tail. + /// Useful if another reader of the same file wants to avoid re-reading + /// the file tail. See ReadOptions.SetSerializedFileTail(). 
+ /// + /// \return a string of bytes with the file tail + std::string GetSerializedFileTail(); + + /// \brief Return the metadata read from the ORC file + /// + /// \return A KeyValueMetadata object containing the ORC metadata + Result> ReadMetadata(); + + private: + class Impl; + std::unique_ptr impl_; + ORCFileReader(); +}; + +/// \class ORCFileWriter +/// \brief Write an Arrow Table or RecordBatch to an ORC file. +class ARROW_EXPORT ORCFileWriter { + public: + ~ORCFileWriter(); + /// \brief Creates a new ORC writer. + /// + /// \param[in] output_stream a pointer to the io::OutputStream to write into + /// \param[in] write_options the ORC writer options for Arrow + /// \return the returned writer object + static Result> Open( + io::OutputStream* output_stream, + const WriteOptions& write_options = WriteOptions()); + + /// \brief Write a table. This can be called multiple times. + /// + /// Tables passed in subsequent calls must match the schema of the table that was + /// written first. + /// + /// \param[in] table the Arrow table from which data is extracted. + /// \return Status + Status Write(const Table& table); + + /// \brief Write a RecordBatch. This can be called multiple times. + /// + /// RecordBatches passed in subsequent calls must match the schema of the + /// RecordBatch that was written first. + /// + /// \param[in] record_batch the Arrow RecordBatch from which data is extracted. + /// \return Status + Status Write(const RecordBatch& record_batch); + + /// \brief Close an ORC writer (orc::Writer) + /// + /// \return Status + Status Close(); + + private: + class Impl; + std::unique_ptr impl_; + + private: + ORCFileWriter(); +}; + +} // namespace orc +} // namespace adapters +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/orc/options.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/orc/options.h new file mode 100644 index 0000000000000000000000000000000000000000..3a300da678db98c24949203be7ab471a57502640 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/orc/options.h @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
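A minimal round-trip sketch (not from the vendored headers) using ORCFileReader and ORCFileWriter as declared in adapter.h above, with default WriteOptions. The input and output streams are assumed to come from elsewhere, and the RandomAccessFile parameter type follows upstream Arrow since this copy dropped the template argument.

arrow::Status RoundTripOrc(const std::shared_ptr<arrow::io::RandomAccessFile>& input,
                           arrow::io::OutputStream* output) {
  using arrow::adapters::orc::ORCFileReader;
  using arrow::adapters::orc::ORCFileWriter;
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ORCFileReader> reader,
                        ORCFileReader::Open(input, arrow::default_memory_pool()));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table, reader->Read());
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ORCFileWriter> writer, ORCFileWriter::Open(output));
  ARROW_RETURN_NOT_OK(writer->Write(*table));  // may be called repeatedly with matching schemas
  return writer->Close();
}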
+ +#pragma once + +#include + +#include "arrow/io/interfaces.h" +#include "arrow/status.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace adapters { + +namespace orc { + +enum class WriterId : int32_t { + kOrcJava = 0, + kOrcCpp = 1, + kPresto = 2, + kScritchleyGo = 3, + kTrino = 4, + kUnknown = INT32_MAX +}; + +enum class WriterVersion : int32_t { + kOriginal = 0, + kHive8732 = 1, + kHive4243 = 2, + kHive12055 = 3, + kHive13083 = 4, + kOrc101 = 5, + kOrc135 = 6, + kOrc517 = 7, + kOrc203 = 8, + kOrc14 = 9, + kMax = INT32_MAX +}; + +enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression }; + +class ARROW_EXPORT FileVersion { + private: + int32_t major_version_; + int32_t minor_version_; + + public: + static const FileVersion& v_0_11(); + static const FileVersion& v_0_12(); + + FileVersion(int32_t major, int32_t minor) + : major_version_(major), minor_version_(minor) {} + + /** + * Get major version + */ + int32_t major_version() const { return this->major_version_; } + + /** + * Get minor version + */ + int32_t minor_version() const { return this->minor_version_; } + + bool operator==(const FileVersion& right) const { + return this->major_version() == right.major_version() && + this->minor_version() == right.minor_version(); + } + + bool operator!=(const FileVersion& right) const { return !(*this == right); } + + std::string ToString() const; +}; + +/// Options for the ORC Writer +struct ARROW_EXPORT WriteOptions { + /// Number of rows the ORC writer writes at a time, default 1024 + int64_t batch_size = 1024; + /// Which ORC file version to use, default FileVersion(0, 12) + FileVersion file_version = FileVersion(0, 12); + /// Size of each ORC stripe in bytes, default 64 MiB + int64_t stripe_size = 64 * 1024 * 1024; + /// The compression codec of the ORC file, there is no compression by default + Compression::type compression = Compression::UNCOMPRESSED; + /// The size of each compression block in bytes, default 64 KiB + int64_t compression_block_size = 64 * 1024; + /// The compression strategy i.e. speed vs size reduction, default + /// CompressionStrategy::kSpeed + CompressionStrategy compression_strategy = CompressionStrategy::kSpeed; + /// The number of rows per an entry in the row index, default 10000 + int64_t row_index_stride = 10000; + /// The padding tolerance, default 0.0 + double padding_tolerance = 0.0; + /// The dictionary key size threshold. 0 to disable dictionary encoding. + /// 1 to always enable dictionary encoding, default 0.0 + double dictionary_key_size_threshold = 0.0; + /// The array of columns that use the bloom filter, default empty + std::vector bloom_filter_columns; + /// The upper limit of the false-positive rate of the bloom filter, default 0.05 + double bloom_filter_fpp = 0.05; +}; + +} // namespace orc +} // namespace adapters +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/tensorflow/convert.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/tensorflow/convert.h new file mode 100644 index 0000000000000000000000000000000000000000..9d093eddf6b598150ddb55da0e84699a5b7ef4b8 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/adapters/tensorflow/convert.h @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "tensorflow/core/framework/op.h" + +#include "arrow/type.h" + +// These utilities are supposed to be included in TensorFlow operators +// that need to be compiled separately from Arrow because of ABI issues. +// They therefore need to be header-only. + +namespace arrow { + +namespace adapters { + +namespace tensorflow { + +Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr* out) { + switch (dtype) { + case ::tensorflow::DT_BOOL: + *out = arrow::boolean(); + break; + case ::tensorflow::DT_FLOAT: + *out = arrow::float32(); + break; + case ::tensorflow::DT_DOUBLE: + *out = arrow::float64(); + break; + case ::tensorflow::DT_HALF: + *out = arrow::float16(); + break; + case ::tensorflow::DT_INT8: + *out = arrow::int8(); + break; + case ::tensorflow::DT_INT16: + *out = arrow::int16(); + break; + case ::tensorflow::DT_INT32: + *out = arrow::int32(); + break; + case ::tensorflow::DT_INT64: + *out = arrow::int64(); + break; + case ::tensorflow::DT_UINT8: + *out = arrow::uint8(); + break; + case ::tensorflow::DT_UINT16: + *out = arrow::uint16(); + break; + case ::tensorflow::DT_UINT32: + *out = arrow::uint32(); + break; + case ::tensorflow::DT_UINT64: + *out = arrow::uint64(); + break; + default: + return Status::TypeError("TensorFlow data type is not supported"); + } + return Status::OK(); +} + +Status GetTensorFlowType(std::shared_ptr dtype, ::tensorflow::DataType* out) { + switch (dtype->id()) { + case Type::BOOL: + *out = ::tensorflow::DT_BOOL; + break; + case Type::UINT8: + *out = ::tensorflow::DT_UINT8; + break; + case Type::INT8: + *out = ::tensorflow::DT_INT8; + break; + case Type::UINT16: + *out = ::tensorflow::DT_UINT16; + break; + case Type::INT16: + *out = ::tensorflow::DT_INT16; + break; + case Type::UINT32: + *out = ::tensorflow::DT_UINT32; + break; + case Type::INT32: + *out = ::tensorflow::DT_INT32; + break; + case Type::UINT64: + *out = ::tensorflow::DT_UINT64; + break; + case Type::INT64: + *out = ::tensorflow::DT_INT64; + break; + case Type::HALF_FLOAT: + *out = ::tensorflow::DT_HALF; + break; + case Type::FLOAT: + *out = ::tensorflow::DT_FLOAT; + break; + case Type::DOUBLE: + *out = ::tensorflow::DT_DOUBLE; + break; + default: + return Status::TypeError("Arrow data type is not supported"); + } + return arrow::Status::OK(); +} + +} // namespace tensorflow + +} // namespace adapters + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/api.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/api.h new file mode 100644 index 0000000000000000000000000000000000000000..ac568a00eedc32984758f4675b58ac626c9c947a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/api.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under 
one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Coarse public API while the library is in development + +#pragma once + +#include "arrow/array.h" // IWYU pragma: export +#include "arrow/array/array_run_end.h" // IWYU pragma: export +#include "arrow/array/concatenate.h" // IWYU pragma: export +#include "arrow/buffer.h" // IWYU pragma: export +#include "arrow/builder.h" // IWYU pragma: export +#include "arrow/chunked_array.h" // IWYU pragma: export +#include "arrow/compare.h" // IWYU pragma: export +#include "arrow/config.h" // IWYU pragma: export +#include "arrow/datum.h" // IWYU pragma: export +#include "arrow/extension_type.h" // IWYU pragma: export +#include "arrow/memory_pool.h" // IWYU pragma: export +#include "arrow/pretty_print.h" // IWYU pragma: export +#include "arrow/record_batch.h" // IWYU pragma: export +#include "arrow/result.h" // IWYU pragma: export +#include "arrow/status.h" // IWYU pragma: export +#include "arrow/table.h" // IWYU pragma: export +#include "arrow/table_builder.h" // IWYU pragma: export +#include "arrow/tensor.h" // IWYU pragma: export +#include "arrow/type.h" // IWYU pragma: export +#include "arrow/util/key_value_metadata.h" // IWYU pragma: export +#include "arrow/visit_array_inline.h" // IWYU pragma: export +#include "arrow/visit_scalar_inline.h" // IWYU pragma: export +#include "arrow/visitor.h" // IWYU pragma: export + +/// \brief Top-level namespace for Apache Arrow C++ API +namespace arrow {} diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array.h new file mode 100644 index 0000000000000000000000000000000000000000..4d72ea9506a414fd6e50d5c7d0af437084045e05 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Kitchen-sink public API for arrow::Array data structures. 
C++ library code +// (especially header files) in Apache Arrow should use more specific headers +// unless it's a file that uses most or all Array types in which case using +// arrow/array.h is fine. + +#pragma once + +/// \defgroup numeric-arrays Concrete classes for numeric arrays +/// @{ +/// @} + +/// \defgroup binary-arrays Concrete classes for binary/string arrays +/// @{ +/// @} + +/// \defgroup nested-arrays Concrete classes for nested arrays +/// @{ +/// @} + +/// \defgroup run-end-encoded-arrays Concrete classes for run-end encoded arrays +/// @{ +/// @} + +#include "arrow/array/array_base.h" // IWYU pragma: keep +#include "arrow/array/array_binary.h" // IWYU pragma: keep +#include "arrow/array/array_decimal.h" // IWYU pragma: keep +#include "arrow/array/array_dict.h" // IWYU pragma: keep +#include "arrow/array/array_nested.h" // IWYU pragma: keep +#include "arrow/array/array_primitive.h" // IWYU pragma: keep +#include "arrow/array/array_run_end.h" // IWYU pragma: keep +#include "arrow/array/data.h" // IWYU pragma: keep +#include "arrow/array/util.h" // IWYU pragma: keep diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_base.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_base.h new file mode 100644 index 0000000000000000000000000000000000000000..e4af67d7e5f0b77d50a1b781ed7805834f22c22c --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_base.h @@ -0,0 +1,317 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/buffer.h" +#include "arrow/compare.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" +#include "arrow/visitor.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// User array accessor types + +/// \brief Array base type +/// Immutable data array with some logical type and some length. +/// +/// Any memory is owned by the respective Buffer instance (or its parents). +/// +/// The base class is only required to have a null bitmap buffer if the null +/// count is greater than 0 +/// +/// If known, the null count can be provided in the base Array constructor. If +/// the null count is not known, pass -1 to indicate that the null count is to +/// be computed on the first call to null_count() +class ARROW_EXPORT Array { + public: + virtual ~Array() = default; + + /// \brief Return true if value at index is null. 
Does not boundscheck + bool IsNull(int64_t i) const { return !IsValid(i); } + + /// \brief Return true if value at index is valid (not null). Does not + /// boundscheck + bool IsValid(int64_t i) const { + if (null_bitmap_data_ != NULLPTR) { + return bit_util::GetBit(null_bitmap_data_, i + data_->offset); + } + // Dispatching with a few conditionals like this makes IsNull more + // efficient for how it is used in practice. Making IsNull virtual + // would add a vtable lookup to every call and prevent inlining + + // a potential inner-branch removal. + if (type_id() == Type::SPARSE_UNION) { + return !internal::IsNullSparseUnion(*data_, i); + } + if (type_id() == Type::DENSE_UNION) { + return !internal::IsNullDenseUnion(*data_, i); + } + if (type_id() == Type::RUN_END_ENCODED) { + return !internal::IsNullRunEndEncoded(*data_, i); + } + return data_->null_count != data_->length; + } + + /// \brief Return a Scalar containing the value of this array at i + Result> GetScalar(int64_t i) const; + + /// Size in the number of elements this array contains. + int64_t length() const { return data_->length; } + + /// A relative position into another array's data, to enable zero-copy + /// slicing. This value defaults to zero + int64_t offset() const { return data_->offset; } + + /// The number of null entries in the array. If the null count was not known + /// at time of construction (and set to a negative value), then the null + /// count will be computed and cached on the first invocation of this + /// function + int64_t null_count() const; + + /// \brief Computes the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// null_count(). For types that have no validity bitmap, this function will + /// recompute the null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + + const std::shared_ptr& type() const { return data_->type; } + Type::type type_id() const { return data_->type->id(); } + + /// Buffer for the validity (null) bitmap, if any. Note that Union types + /// never have a null bitmap. + /// + /// Note that for `null_count == 0` or for null type, this will be null. + /// This buffer does not account for any slice offset + const std::shared_ptr& null_bitmap() const { return data_->buffers[0]; } + + /// Raw pointer to the null bitmap. + /// + /// Note that for `null_count == 0` or for null type, this will be null. + /// This buffer does not account for any slice offset + const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } + + /// Equality comparison with another array + bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; + bool Equals(const std::shared_ptr& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// \brief Return the formatted unified diff of arrow::Diff between this + /// Array and another Array + std::string Diff(const Array& other) const; + + /// Approximate equality comparison with another array + /// + /// epsilon is only used if this is FloatArray or DoubleArray + bool ApproxEquals(const std::shared_ptr& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + bool ApproxEquals(const Array& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// Compare if the range of slots specified are equal for the given array and + /// this array. end_idx exclusive. 
This methods does not bounds check. + bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, + const Array& other, + const EqualOptions& = EqualOptions::Defaults()) const; + bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, + const std::shared_ptr& other, + const EqualOptions& = EqualOptions::Defaults()) const; + bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, + int64_t other_start_idx, + const EqualOptions& = EqualOptions::Defaults()) const; + bool RangeEquals(const std::shared_ptr& other, int64_t start_idx, + int64_t end_idx, int64_t other_start_idx, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// \brief Apply the ArrayVisitor::Visit() method specialized to the array type + Status Accept(ArrayVisitor* visitor) const; + + /// Construct a zero-copy view of this array with the given type. + /// + /// This method checks if the types are layout-compatible. + /// Nested types are traversed in depth-first order. Data buffers must have + /// the same item sizes, even though the logical types may be different. + /// An error is returned if the types are not layout-compatible. + Result> View(const std::shared_ptr& type) const; + + /// \brief Construct a copy of the array with all buffers on destination + /// Memory Manager + /// + /// This method recursively copies the array's buffers and those of its children + /// onto the destination MemoryManager device and returns the new Array. + Result> CopyTo(const std::shared_ptr& to) const; + + /// \brief Construct a new array attempting to zero-copy view if possible. + /// + /// Like CopyTo this method recursively goes through all of the array's buffers + /// and those of it's children and first attempts to create zero-copy + /// views on the destination MemoryManager device. If it can't, it falls back + /// to performing a copy. See Buffer::ViewOrCopy. + Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + + /// Construct a zero-copy slice of the array with the indicated offset and + /// length + /// + /// \param[in] offset the position of the first element in the constructed + /// slice + /// \param[in] length the length of the slice. If there are not enough + /// elements in the array, the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr + std::shared_ptr Slice(int64_t offset, int64_t length) const; + + /// Slice from offset until end of the array + std::shared_ptr Slice(int64_t offset) const; + + /// Input-checking variant of Array::Slice + Result> SliceSafe(int64_t offset, int64_t length) const; + /// Input-checking variant of Array::Slice + Result> SliceSafe(int64_t offset) const; + + const std::shared_ptr& data() const { return data_; } + + int num_fields() const { return static_cast(data_->child_data.size()); } + + /// \return PrettyPrint representation of array suitable for debugging + std::string ToString() const; + + /// \brief Perform cheap validation checks to determine obvious inconsistencies + /// within the array's internal data. + /// + /// This is O(k) where k is the number of descendents. + /// + /// \return Status + Status Validate() const; + + /// \brief Perform extensive validation checks to determine inconsistencies + /// within the array's internal data. + /// + /// This is potentially O(k*n) where k is the number of descendents and n + /// is the array length. 
+ /// + /// \return Status + Status ValidateFull() const; + + /// \brief Return the device_type that this array's data is allocated on + /// + /// This just delegates to calling device_type on the underlying ArrayData + /// object which backs this Array. + /// + /// \return DeviceAllocationType + DeviceAllocationType device_type() const { return data_->device_type(); } + + /// \brief Return the statistics of this Array + /// + /// This just delegates to calling statistics on the underlying ArrayData + /// object which backs this Array. + /// + /// \return const ArrayStatistics& + std::shared_ptr statistics() const { return data_->statistics; } + + protected: + Array() = default; + ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); + + std::shared_ptr data_; + const uint8_t* null_bitmap_data_ = NULLPTR; + + /// Protected method for constructors + void SetData(const std::shared_ptr& data) { + if (data->buffers.size() > 0) { + null_bitmap_data_ = data->GetValuesSafe(0, /*offset=*/0); + } else { + null_bitmap_data_ = NULLPTR; + } + data_ = data; + } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Array); + + ARROW_FRIEND_EXPORT friend void PrintTo(const Array& x, std::ostream* os); +}; + +static inline std::ostream& operator<<(std::ostream& os, const Array& x) { + os << x.ToString(); + return os; +} + +/// Base class for non-nested arrays +class ARROW_EXPORT FlatArray : public Array { + protected: + using Array::Array; +}; + +/// Base class for arrays of fixed-size logical types +class ARROW_EXPORT PrimitiveArray : public FlatArray { + public: + PrimitiveArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// Does not account for any slice offset + const std::shared_ptr& values() const { return data_->buffers[1]; } + + protected: + PrimitiveArray() : raw_values_(NULLPTR) {} + + void SetData(const std::shared_ptr& data) { + this->Array::SetData(data); + raw_values_ = data->GetValuesSafe(1, /*offset=*/0); + } + + explicit PrimitiveArray(const std::shared_ptr& data) { SetData(data); } + + const uint8_t* raw_values_; +}; + +/// Degenerate null type Array +class ARROW_EXPORT NullArray : public FlatArray { + public: + using TypeClass = NullType; + + explicit NullArray(const std::shared_ptr& data) { SetData(data); } + explicit NullArray(int64_t length); + + private: + void SetData(const std::shared_ptr& data) { + null_bitmap_data_ = NULLPTR; + data->null_count = data->length; + data_ = data; + } +}; + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_binary.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_binary.h new file mode 100644 index 0000000000000000000000000000000000000000..63903eac46d413c24ccaeb048273e8f5e6c8d3c6 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_binary.h @@ -0,0 +1,321 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Array accessor classes for Binary, LargeBinary, String, LargeString, +// FixedSizeBinary + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/buffer.h" +#include "arrow/stl_iterator.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup binary-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// Binary and String + +/// Base class for variable-sized binary arrays, regardless of offset size +/// and logical interpretation. +template +class BaseBinaryArray : public FlatArray { + public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + using IteratorType = stl::ArrayIterator>; + + /// Return the pointer to the given elements bytes + // XXX should GetValue(int64_t i) return a string_view? + const uint8_t* GetValue(int64_t i, offset_type* out_length) const { + const offset_type pos = raw_value_offsets_[i]; + *out_length = raw_value_offsets_[i + 1] - pos; + return raw_data_ + pos; + } + + /// \brief Get binary value as a string_view + /// + /// \param i the value index + /// \return the view over the selected value + std::string_view GetView(int64_t i) const { + const offset_type pos = raw_value_offsets_[i]; + return std::string_view(reinterpret_cast(raw_data_ + pos), + raw_value_offsets_[i + 1] - pos); + } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + /// \brief Get binary value as a string_view + /// Provided for consistency with other arrays. + /// + /// \param i the value index + /// \return the view over the selected value + std::string_view Value(int64_t i) const { return GetView(i); } + + /// \brief Get binary value as a std::string + /// + /// \param i the value index + /// \return the value copied into a std::string + std::string GetString(int64_t i) const { return std::string(GetView(i)); } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return data_->buffers[1]; } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_data() const { return data_->buffers[2]; } + + const offset_type* raw_value_offsets() const { return raw_value_offsets_; } + + const uint8_t* raw_data() const { return raw_data_; } + + /// \brief Return the data buffer absolute offset of the data for the value + /// at the passed index. + /// + /// Does not perform boundschecking + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; } + + /// \brief Return the length of the data for the value at the passed index. + /// + /// Does not perform boundschecking + offset_type value_length(int64_t i) const { + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; + } + + /// \brief Return the total length of the memory in the data buffer + /// referenced by this array. 
If the array has been sliced then this may be + /// less than the size of the data buffer (data_->buffers[2]). + offset_type total_values_length() const { + if (data_->length > 0) { + return raw_value_offsets_[data_->length] - raw_value_offsets_[0]; + } else { + return 0; + } + } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + // For subclasses + BaseBinaryArray() = default; + + // Protected method for constructors + void SetData(const std::shared_ptr& data) { + this->Array::SetData(data); + raw_value_offsets_ = data->GetValuesSafe(1); + raw_data_ = data->GetValuesSafe(2, /*offset=*/0); + } + + const offset_type* raw_value_offsets_ = NULLPTR; + const uint8_t* raw_data_ = NULLPTR; +}; + +/// Concrete Array class for variable-size binary data +class ARROW_EXPORT BinaryArray : public BaseBinaryArray { + public: + explicit BinaryArray(const std::shared_ptr& data); + + BinaryArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + protected: + // For subclasses such as StringArray + BinaryArray() : BaseBinaryArray() {} +}; + +/// Concrete Array class for variable-size string (utf-8) data +class ARROW_EXPORT StringArray : public BinaryArray { + public: + using TypeClass = StringType; + + explicit StringArray(const std::shared_ptr& data); + + StringArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + +/// Concrete Array class for large variable-size binary data +class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray { + public: + explicit LargeBinaryArray(const std::shared_ptr& data); + + LargeBinaryArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + protected: + // For subclasses such as LargeStringArray + LargeBinaryArray() : BaseBinaryArray() {} +}; + +/// Concrete Array class for large variable-size string (utf-8) data +class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { + public: + using TypeClass = LargeStringType; + + explicit LargeStringArray(const std::shared_ptr& data); + + LargeStringArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + +// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// BinaryViewType::c_type struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public FlatArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator; + using c_type = BinaryViewType::c_type; + + explicit BinaryViewArray(std::shared_ptr data); + + 
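Editorial aside, not part of the vendored header above: a minimal sketch of how the accessors documented in array_base.h and array_binary.h are typically used together, assuming the arrow/api.h umbrella header added earlier in this diff; the helper name StringArrayRoundTrip and the sample values are illustrative only.

#include <cassert>
#include <memory>

#include <arrow/api.h>

// Build a small utf-8 array, then read it back through the documented
// accessors: IsNull()/length()/Slice() on Array, GetView()/GetString() on
// StringArray.
arrow::Status StringArrayRoundTrip() {
  arrow::StringBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append("foo"));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_RETURN_NOT_OK(builder.Append("bar"));
  std::shared_ptr<arrow::Array> array;
  ARROW_RETURN_NOT_OK(builder.Finish(&array));

  const auto& strings = static_cast<const arrow::StringArray&>(*array);
  assert(strings.length() == 3);
  assert(strings.IsNull(1));              // validity-bitmap lookup, no bounds check
  assert(strings.GetView(0) == "foo");    // zero-copy view into the value-data buffer
  assert(strings.GetString(2) == "bar");  // copies the bytes into a std::string

  std::shared_ptr<arrow::Array> head = strings.Slice(/*offset=*/0, /*length=*/2);
  assert(head->length() == 2);            // zero-copy slice sharing the same buffers
  return arrow::Status::OK();
}

int main() { return StringArrayRoundTrip().ok() ? 0 : 1; }

Because GetView() returns a zero-copy std::string_view into the array's data buffer, the StringArray must outlive any views taken from it.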
BinaryViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr views, BufferVector data_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + // For API compatibility with BinaryArray etc. + std::string_view GetView(int64_t i) const; + std::string GetString(int64_t i) const { return std::string{GetView(i)}; } + + const auto& values() const { return data_->buffers[1]; } + const c_type* raw_values() const { return raw_values_; } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using FlatArray::FlatArray; + + void SetData(std::shared_ptr data) { + FlatArray::SetData(std::move(data)); + raw_values_ = data_->GetValuesSafe(1); + } + + const c_type* raw_values_; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// BinaryViewType::c_type to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(std::shared_ptr data); + + using BinaryViewArray::BinaryViewArray; + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + +// ---------------------------------------------------------------------- +// Fixed width binary + +/// Concrete Array class for fixed-size binary data +class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { + public: + using TypeClass = FixedSizeBinaryType; + using IteratorType = stl::ArrayIterator; + + explicit FixedSizeBinaryArray(const std::shared_ptr& data); + + FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const uint8_t* GetValue(int64_t i) const { return values_ + i * byte_width_; } + const uint8_t* Value(int64_t i) const { return GetValue(i); } + + std::string_view GetView(int64_t i) const { + return std::string_view(reinterpret_cast(GetValue(i)), byte_width_); + } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + std::string GetString(int64_t i) const { return std::string(GetView(i)); } + + int32_t byte_width() const { return byte_width_; } + + const uint8_t* raw_values() const { return values_; } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + void SetData(const std::shared_ptr& data) { + this->PrimitiveArray::SetData(data); + byte_width_ = + internal::checked_cast(*type()).byte_width(); + values_ = raw_values_ + data_->offset * byte_width_; + } + + const uint8_t* values_; + int32_t byte_width_; +}; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_decimal.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_decimal.h new file mode 100644 index 0000000000000000000000000000000000000000..2f10bb842999640a8cada703ff12ea29c0e5f718 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_decimal.h @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array/array_binary.h" +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup numeric-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// Decimal32Array + +/// Concrete Array class for 32-bit decimal data +class ARROW_EXPORT Decimal32Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal32Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal32Array from ArrayData instance + explicit Decimal32Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +// ---------------------------------------------------------------------- +// Decimal64Array + +/// Concrete Array class for 64-bit decimal data +class ARROW_EXPORT Decimal64Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal64Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal64Array from ArrayData instance + explicit Decimal64Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +// ---------------------------------------------------------------------- +// Decimal128Array + +/// Concrete Array class for 128-bit decimal data +class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal128Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal128Array from ArrayData instance + explicit Decimal128Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +// Backward compatibility +using DecimalArray = Decimal128Array; + +// ---------------------------------------------------------------------- +// Decimal256Array + +/// Concrete Array class for 256-bit decimal data +class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal256Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal256Array from ArrayData instance + explicit Decimal256Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_dict.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_dict.h new file mode 100644 index 0000000000000000000000000000000000000000..bf376b51f8c9470d2b4e4c7ed950c9a513fddc9b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_dict.h @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// DictionaryArray + +/// \brief Array type for dictionary-encoded data with a +/// data-dependent dictionary +/// +/// A dictionary array contains an array of non-negative integers (the +/// "dictionary indices") along with a data type containing a "dictionary" +/// corresponding to the distinct values represented in the data. +/// +/// For example, the array +/// +/// ["foo", "bar", "foo", "bar", "foo", "bar"] +/// +/// with dictionary ["bar", "foo"], would have dictionary array representation +/// +/// indices: [1, 0, 1, 0, 1, 0] +/// dictionary: ["bar", "foo"] +/// +/// The indices in principle may be any integer type. +class ARROW_EXPORT DictionaryArray : public Array { + public: + using TypeClass = DictionaryType; + + explicit DictionaryArray(const std::shared_ptr& data); + + DictionaryArray(const std::shared_ptr& type, + const std::shared_ptr& indices, + const std::shared_ptr& dictionary); + + /// \brief Construct DictionaryArray from dictionary and indices + /// array and validate + /// + /// This function does the validation of the indices and input type. It checks if + /// all indices are non-negative and smaller than the size of the dictionary. + /// + /// \param[in] type a dictionary type + /// \param[in] dictionary the dictionary with same value type as the + /// type object + /// \param[in] indices an array of non-negative integers smaller than the + /// size of the dictionary + static Result> FromArrays( + const std::shared_ptr& type, const std::shared_ptr& indices, + const std::shared_ptr& dictionary); + + static Result> FromArrays( + const std::shared_ptr& indices, const std::shared_ptr& dictionary) { + return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices, + dictionary); + } + + /// \brief Transpose this DictionaryArray + /// + /// This method constructs a new dictionary array with the given dictionary + /// type, transposing indices using the transpose map. The type and the + /// transpose map are typically computed using DictionaryUnifier. 
+ /// + /// \param[in] type the new type object + /// \param[in] dictionary the new dictionary + /// \param[in] transpose_map transposition array of this array's indices + /// into the target array's indices + /// \param[in] pool a pool to allocate the array data from + Result> Transpose( + const std::shared_ptr& type, const std::shared_ptr& dictionary, + const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const; + + Result> Compact(MemoryPool* pool = default_memory_pool()) const; + + /// \brief Determine whether dictionary arrays may be compared without unification + bool CanCompareIndices(const DictionaryArray& other) const; + + /// \brief Return the dictionary for this array, which is stored as + /// a member of the ArrayData internal structure + const std::shared_ptr& dictionary() const; + const std::shared_ptr& indices() const; + + /// \brief Return the ith value of indices, cast to int64_t. Not recommended + /// for use in performance-sensitive code. Does not validate whether the + /// value is null or out-of-bounds. + int64_t GetValueIndex(int64_t i) const; + + const DictionaryType* dict_type() const { return dict_type_; } + + private: + void SetData(const std::shared_ptr& data); + const DictionaryType* dict_type_; + std::shared_ptr indices_; + + // Lazily initialized when invoking dictionary() + mutable std::shared_ptr dictionary_; +}; + +/// \brief Helper class for incremental dictionary unification +class ARROW_EXPORT DictionaryUnifier { + public: + virtual ~DictionaryUnifier() = default; + + /// \brief Construct a DictionaryUnifier + /// \param[in] value_type the data type of the dictionaries + /// \param[in] pool MemoryPool to use for memory allocations + static Result> Make( + std::shared_ptr value_type, MemoryPool* pool = default_memory_pool()); + + /// \brief Unify dictionaries across array chunks + /// + /// The dictionaries in the array chunks will be unified, their indices + /// accordingly transposed. + /// + /// Only dictionaries with a primitive value type are currently supported. + /// However, dictionaries nested inside a more complex type are correctly unified. + static Result> UnifyChunkedArray( + const std::shared_ptr& array, + MemoryPool* pool = default_memory_pool()); + + /// \brief Unify dictionaries across the chunks of each table column + /// + /// The dictionaries in each table column will be unified, their indices + /// accordingly transposed. + /// + /// Only dictionaries with a primitive value type are currently supported. + /// However, dictionaries nested inside a more complex type are correctly unified. + static Result> UnifyTable( + const Table& table, MemoryPool* pool = default_memory_pool()); + + /// \brief Append dictionary to the internal memo + virtual Status Unify(const Array& dictionary) = 0; + + /// \brief Append dictionary and compute transpose indices + /// \param[in] dictionary the dictionary values to unify + /// \param[out] out_transpose a Buffer containing computed transpose indices + /// as int32_t values equal in length to the passed dictionary. The value in + /// each slot corresponds to the new index value for each original index + /// for a DictionaryArray with the old dictionary + virtual Status Unify(const Array& dictionary, + std::shared_ptr* out_transpose) = 0; + + /// \brief Return a result DictionaryType with the smallest possible index + /// type to accommodate the unified dictionary. 
The unifier cannot be used + /// after this is called + virtual Status GetResult(std::shared_ptr* out_type, + std::shared_ptr* out_dict) = 0; + + /// \brief Return a unified dictionary with the given index type. If + /// the index type is not large enough then an invalid status will be returned. + /// The unifier cannot be used after this is called + virtual Status GetResultWithIndexType(const std::shared_ptr& index_type, + std::shared_ptr* out_dict) = 0; +}; + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_nested.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_nested.h new file mode 100644 index 0000000000000000000000000000000000000000..f122f9378b52592403633f62ff50d8e804b02d12 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_nested.h @@ -0,0 +1,887 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList, +// Map, Struct, and Union + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup nested-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// VarLengthListLikeArray + +template +class VarLengthListLikeArray; + +namespace internal { + +// Private helper for [Large]List[View]Array::SetData. +// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header +// doesn't play well with MSVC. +template +void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, + Type::type expected_type_id = TYPE::type_id); + +/// \brief A version of Flatten that keeps recursively flattening until an array of +/// non-list values is reached. +/// +/// Array types considered to be lists by this function: +/// - list +/// - large_list +/// - list_view +/// - large_list_view +/// - fixed_size_list +/// +/// \see ListArray::Flatten +ARROW_EXPORT Result> FlattenLogicalListRecursively( + const Array& in_array, MemoryPool* memory_pool); + +} // namespace internal + +/// Base class for variable-sized list and list-view arrays, regardless of offset size. 
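Editorial aside, not part of the vendored header: a minimal sketch of the DictionaryArray::FromArrays factory documented in array_dict.h above, dictionary-encoding ["foo", "bar", "foo"] by hand. It assumes arrow/api.h; MakeDictionaryExample and the sample values are illustrative only.

#include <memory>

#include <arrow/api.h>

// Indices [1, 0, 1] into the dictionary ["bar", "foo"] reproduce the layout
// described in the DictionaryArray class comment.
arrow::Result<std::shared_ptr<arrow::Array>> MakeDictionaryExample() {
  std::shared_ptr<arrow::Array> indices;
  std::shared_ptr<arrow::Array> dictionary;

  arrow::Int32Builder index_builder;
  ARROW_RETURN_NOT_OK(index_builder.AppendValues({1, 0, 1}));
  ARROW_RETURN_NOT_OK(index_builder.Finish(&indices));

  arrow::StringBuilder dict_builder;
  ARROW_RETURN_NOT_OK(dict_builder.AppendValues({"bar", "foo"}));
  ARROW_RETURN_NOT_OK(dict_builder.Finish(&dictionary));

  // FromArrays checks that every index is non-negative and smaller than the
  // dictionary length, as documented above.
  return arrow::DictionaryArray::FromArrays(indices, dictionary);
}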
+template +class VarLengthListLikeArray : public Array { + public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + + const TypeClass* var_length_list_like_type() const { return this->list_type_; } + + /// \brief Return array object containing the list's values + /// + /// Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& values() const { return values_; } + + /// Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& value_offsets() const { return data_->buffers[1]; } + + const std::shared_ptr& value_type() const { return list_type_->value_type(); } + + /// Return pointer to raw value offsets accounting for any slice offset + const offset_type* raw_value_offsets() const { return raw_value_offsets_; } + + // The following functions will not perform boundschecking + + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists and list-views are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) + virtual offset_type value_length(int64_t i) const = 0; + + /// \pre IsValid(i) + std::shared_ptr value_slice(int64_t i) const { + return values_->Slice(value_offset(i), value_length(i)); + } + + /// \brief Flatten all level recursively until reach a non-list type, and return + /// a non-list type Array. + /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + + protected: + friend void internal::SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, + Type::type expected_type_id); + + const TypeClass* list_type_ = NULLPTR; + std::shared_ptr values_; + const offset_type* raw_value_offsets_ = NULLPTR; +}; + +// ---------------------------------------------------------------------- +// ListArray / LargeListArray + +template +class BaseListArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_type() const { return this->var_length_list_like_type(); } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { + return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; + } +}; + +/// Concrete Array class for list data +class ARROW_EXPORT ListArray : public BaseListArray { + public: + explicit ListArray(std::shared_ptr data); + + ListArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListArray from array of offsets and child value array + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. 
But if a null_bitmap is provided, the offsets array can't have nulls. + /// + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. Must be of int32 type + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// allocated because of null values + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a ListArray from a ListViewArray + static Result> FromListView(const ListViewArray& source, + MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the lists in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration of this array's offsets as well as null elements backed + /// by non-empty lists (they are skipped, thus copying may be needed). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list offsets as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + protected: + // This constructor defers SetData to a derived array class + ListArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +/// Concrete Array class for large list data (with 64-bit offsets) +class ARROW_EXPORT LargeListArray : public BaseListArray { + public: + explicit LargeListArray(const std::shared_ptr& data); + + LargeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListArray from array of offsets and child value array + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. + /// + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. 
Must be of int64 type + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// allocated because of null values + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a LargeListArray from a LargeListViewArray + static Result> FromListView( + const LargeListViewArray& source, MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the lists in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration of this array's offsets as well as null elements backed + /// by non-empty lists (they are skipped, thus copying may be needed). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list offsets as an Int64Array + std::shared_ptr offsets() const; + + protected: + void SetData(const std::shared_ptr& data); +}; + +// ---------------------------------------------------------------------- +// ListViewArray / LargeListViewArray + +template +class BaseListViewArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_view_type() const { return this->var_length_list_like_type(); } + + /// \brief Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& value_sizes() const { return this->data_->buffers[2]; } + + /// \brief Return pointer to raw value offsets accounting for any slice offset + const offset_type* raw_value_sizes() const { return raw_value_sizes_; } + + /// \brief Return the size of the value at a particular index + /// + /// This should not be called if the list-view at slot i is null. + /// The returned size in those cases could be any value from 0 to the + /// length of the child values array. + /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { return this->raw_value_sizes_[i]; } + + protected: + const offset_type* raw_value_sizes_ = NULLPTR; +}; + +/// \brief Concrete Array class for list-view data +class ARROW_EXPORT ListViewArray : public BaseListViewArray { + public: + explicit ListViewArray(std::shared_ptr data); + + ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a ListViewArray using buffers from offsets and sizes arrays + /// that project views into the child values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. 
+ /// + /// If a null_bitmap is not provided, the nulls will be inferred from the + /// offsets's null bitmap. But if a null_bitmap is provided, the offsets array + /// can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int32 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int32 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created. + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a ListViewArray from a ListArray + static Result> FromList(const ListArray& list_array, + MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the list-views in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + /// + /// This function invokes Concatenate() if list-views are non-contiguous. It + /// will try to minimize the number of array slices passed to Concatenate() by + /// maximizing the size of each slice (containing as many contiguous + /// list-views as possible). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. + std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + ListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +/// \brief Concrete Array class for large list-view data (with 64-bit offsets +/// and sizes) +class ARROW_EXPORT LargeListViewArray : public BaseListViewArray { + public: + explicit LargeListViewArray(std::shared_ptr data); + + LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct an LargeListViewArray using buffers from offsets and sizes arrays + /// that project views into the values array. 
+ /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or + /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a + /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int64 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int64 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created. + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a LargeListViewArray from a LargeListArray + static Result> FromList( + const LargeListArray& list_array, MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the large list-views in this + /// array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. 
+ std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + LargeListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +// ---------------------------------------------------------------------- +// MapArray + +/// Concrete Array class for map data +/// +/// NB: "value" in this context refers to a pair of a key and the corresponding item +class ARROW_EXPORT MapArray : public ListArray { + public: + using TypeClass = MapType; + + explicit MapArray(const std::shared_ptr& data); + + MapArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& keys, const std::shared_ptr& items, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + MapArray(const std::shared_ptr& type, int64_t length, BufferVector buffers, + const std::shared_ptr& keys, const std::shared_ptr& items, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + MapArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct MapArray from array of offsets and child key, item arrays + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. Must be of int32 type + /// \param[in] keys Array containing key values + /// \param[in] items Array containing item values + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// \param[in] null_bitmap Optional validity bitmap + /// allocated because of null values + static Result> FromArrays( + const std::shared_ptr& offsets, const std::shared_ptr& keys, + const std::shared_ptr& items, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR); + + static Result> FromArrays( + std::shared_ptr type, const std::shared_ptr& offsets, + const std::shared_ptr& keys, const std::shared_ptr& items, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR); + + const MapType* map_type() const { return map_type_; } + + /// \brief Return array object containing all map keys + const std::shared_ptr& keys() const { return keys_; } + + /// \brief Return array object containing all mapped items + const std::shared_ptr& items() const { return items_; } + + /// Validate child data before constructing the actual MapArray. 
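Editorial aside, not part of the vendored header: a minimal sketch of the offsets-plus-values construction pattern shared by ListArray::FromArrays and the analogous MapArray::FromArrays documented above, assuming arrow/api.h; MakeListExample and the sample data are illustrative only.

#include <memory>

#include <arrow/api.h>

// Build list<int32> [[1, 2], [], [3]] from an int32 offsets array with n + 1
// entries and a child values array, then flatten it back into a plain array.
arrow::Result<std::shared_ptr<arrow::Array>> MakeListExample() {
  std::shared_ptr<arrow::Array> offsets;
  std::shared_ptr<arrow::Array> values;

  arrow::Int32Builder offset_builder;
  ARROW_RETURN_NOT_OK(offset_builder.AppendValues({0, 2, 2, 3}));
  ARROW_RETURN_NOT_OK(offset_builder.Finish(&offsets));

  arrow::Int32Builder value_builder;
  ARROW_RETURN_NOT_OK(value_builder.AppendValues({1, 2, 3}));
  ARROW_RETURN_NOT_OK(value_builder.Finish(&values));

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ListArray> list,
                        arrow::ListArray::FromArrays(*offsets, *values));
  // Flatten() honours the array's slice offset and skips null slots, unlike
  // the raw values() accessor.
  return list->Flatten();
}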
+ static Status ValidateChildData( + const std::vector>& child_data); + + protected: + void SetData(const std::shared_ptr& data); + + static Result> FromArraysInternal( + std::shared_ptr type, const std::shared_ptr& offsets, + const std::shared_ptr& keys, const std::shared_ptr& items, + MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR); + + private: + const MapType* map_type_; + std::shared_ptr keys_, items_; +}; + +// ---------------------------------------------------------------------- +// FixedSizeListArray + +/// Concrete Array class for fixed size list data +class ARROW_EXPORT FixedSizeListArray : public Array { + public: + using TypeClass = FixedSizeListType; + using offset_type = TypeClass::offset_type; + + explicit FixedSizeListArray(const std::shared_ptr& data); + + FixedSizeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const FixedSizeListType* list_type() const; + + /// \brief Return array object containing the list's values + const std::shared_ptr& values() const; + + const std::shared_ptr& value_type() const; + + // The following functions will not perform boundschecking + int64_t value_offset(int64_t i) const { + i += data_->offset; + return list_size_ * i; + } + /// \brief Return the fixed-size of the values + /// + /// No matter the value of the index parameter, the result is the same. + /// So even when the value at slot i is null, this function will return a + /// non-zero size. + /// + /// \pre IsValid(i) + int32_t value_length(int64_t i = 0) const { + ARROW_UNUSED(i); + return list_size_; + } + /// \pre IsValid(i) + std::shared_ptr value_slice(int64_t i) const { + return values_->Slice(value_offset(i), value_length(i)); + } + + /// \brief Return an Array that is a concatenation of the lists in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration null elements (they are skipped, thus copying may be needed). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Flatten all level recursively until reach a non-list type, and return + /// a non-list type Array. 
+ /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + + /// \brief Construct FixedSizeListArray from child value array and value_length + /// + /// \param[in] values Array containing list values + /// \param[in] list_size The fixed length of each list + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + /// \return Will have length equal to values.length() / list_size + static Result> FromArrays( + const std::shared_ptr& values, int32_t list_size, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Construct FixedSizeListArray from child value array and type + /// + /// \param[in] values Array containing list values + /// \param[in] type The fixed sized list type + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + /// \return Will have length equal to values.length() / type.list_size() + static Result> FromArrays( + const std::shared_ptr& values, std::shared_ptr type, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + protected: + void SetData(const std::shared_ptr& data); + int32_t list_size_; + + private: + std::shared_ptr values_; +}; + +// ---------------------------------------------------------------------- +// Struct + +/// Concrete Array class for struct data +class ARROW_EXPORT StructArray : public Array { + public: + using TypeClass = StructType; + + explicit StructArray(const std::shared_ptr& data); + + StructArray(const std::shared_ptr& type, int64_t length, + const std::vector>& children, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Return a StructArray from child arrays and field names. + /// + /// The length and data type are automatically inferred from the arguments. + /// There should be at least one child array. + static Result> Make( + const ArrayVector& children, const std::vector& field_names, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Return a StructArray from child arrays and fields. + /// + /// The length is automatically inferred from the arguments. + /// There should be at least one child array. This method does not + /// check that field types and child array types are consistent. + static Result> Make( + const ArrayVector& children, const FieldVector& fields, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const StructType* struct_type() const; + + // Return a shared pointer in case the requestor desires to share ownership + // with this array. The returned array has its offset, length and null + // count adjusted. + const std::shared_ptr& field(int pos) const; + + const ArrayVector& fields() const; + + /// Returns null if name not found + std::shared_ptr GetFieldByName(const std::string& name) const; + + /// Indicate if field named `name` can be found unambiguously in the struct. + Status CanReferenceFieldByName(const std::string& name) const; + + /// Indicate if fields named `names` can be found unambiguously in the struct. 
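Editorial aside, not part of the vendored header: a minimal sketch of StructArray::Make and lookup by field name as documented above, assuming arrow/api.h; MakeStructExample and the field names are illustrative only.

#include <memory>

#include <arrow/api.h>

// Assemble a struct<id: int32, name: utf8> array from two equally long child
// arrays; the length and struct type are inferred from the children.
arrow::Result<std::shared_ptr<arrow::Array>> MakeStructExample() {
  std::shared_ptr<arrow::Array> ids;
  std::shared_ptr<arrow::Array> names;

  arrow::Int32Builder id_builder;
  ARROW_RETURN_NOT_OK(id_builder.AppendValues({1, 2}));
  ARROW_RETURN_NOT_OK(id_builder.Finish(&ids));

  arrow::StringBuilder name_builder;
  ARROW_RETURN_NOT_OK(name_builder.AppendValues({"a", "b"}));
  ARROW_RETURN_NOT_OK(name_builder.Finish(&names));

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::StructArray> records,
                        arrow::StructArray::Make({ids, names}, {"id", "name"}));
  return records->GetFieldByName("name");  // returns nullptr if the name is absent
}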
+ Status CanReferenceFieldsByNames(const std::vector& names) const; + + /// \brief Flatten this array as a vector of arrays, one for each field + /// + /// \param[in] pool The pool to allocate null bitmaps from, if necessary + Result Flatten(MemoryPool* pool = default_memory_pool()) const; + + /// \brief Get one of the child arrays, combining its null bitmap + /// with the parent struct array's bitmap. + /// + /// \param[in] index Which child array to get + /// \param[in] pool The pool to allocate null bitmaps from, if necessary + Result> GetFlattenedField( + int index, MemoryPool* pool = default_memory_pool()) const; + + private: + // For caching boxed child data + // XXX This is not handled in a thread-safe manner. + mutable ArrayVector boxed_fields_; +}; + +// ---------------------------------------------------------------------- +// Union + +/// Base class for SparseUnionArray and DenseUnionArray +class ARROW_EXPORT UnionArray : public Array { + public: + using type_code_t = int8_t; + + /// Note that this buffer does not account for any slice offset + const std::shared_ptr& type_codes() const { return data_->buffers[1]; } + + const type_code_t* raw_type_codes() const { return raw_type_codes_; } + + /// The logical type code of the value at index. + type_code_t type_code(int64_t i) const { return raw_type_codes_[i]; } + + /// The physical child id containing value at index. + int child_id(int64_t i) const { return union_type_->child_ids()[raw_type_codes_[i]]; } + + const UnionType* union_type() const { return union_type_; } + + UnionMode::type mode() const { return union_type_->mode(); } + + /// \brief Return the given field as an individual array. + /// + /// For sparse unions, the returned array has its offset, length and null + /// count adjusted. + std::shared_ptr field(int pos) const; + + protected: + void SetData(std::shared_ptr data); + + const type_code_t* raw_type_codes_; + const UnionType* union_type_; + + // For caching boxed child data + mutable std::vector> boxed_fields_; +}; + +/// Concrete Array class for sparse union data +class ARROW_EXPORT SparseUnionArray : public UnionArray { + public: + using TypeClass = SparseUnionType; + + explicit SparseUnionArray(std::shared_ptr data); + + SparseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, + std::shared_ptr type_ids, int64_t offset = 0); + + /// \brief Construct SparseUnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the input types. + /// + /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] type_codes Vector of type codes. + static Result> Make(const Array& type_ids, ArrayVector children, + std::vector type_codes) { + return Make(std::move(type_ids), std::move(children), std::vector{}, + std::move(type_codes)); + } + + /// \brief Construct SparseUnionArray with custom field names from type_ids and children + /// + /// This function does the bare minimum of validation of the input types. + /// + /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. + /// \param[in] type_codes Vector of type codes. 
+ static Result> Make(const Array& type_ids, ArrayVector children, + std::vector field_names = {}, + std::vector type_codes = {}); + + const SparseUnionType* union_type() const { + return internal::checked_cast(union_type_); + } + + /// \brief Get one of the child arrays, adjusting its null bitmap + /// where the union array type code does not match. + /// + /// \param[in] index Which child array to get (i.e. the physical index, not the type + /// code) \param[in] pool The pool to allocate null bitmaps from, if necessary + Result> GetFlattenedField( + int index, MemoryPool* pool = default_memory_pool()) const; + + protected: + void SetData(std::shared_ptr data); +}; + +/// \brief Concrete Array class for dense union data +/// +/// Note that union types do not have a validity bitmap +class ARROW_EXPORT DenseUnionArray : public UnionArray { + public: + using TypeClass = DenseUnionType; + + explicit DenseUnionArray(const std::shared_ptr& data); + + DenseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, + std::shared_ptr type_ids, + std::shared_ptr value_offsets = NULLPTR, int64_t offset = 0); + + /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. + /// + /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] type_codes Vector of type codes. + static Result> Make(const Array& type_ids, + const Array& value_offsets, + ArrayVector children, + std::vector type_codes) { + return Make(type_ids, value_offsets, std::move(children), std::vector{}, + std::move(type_codes)); + } + + /// \brief Construct DenseUnionArray with custom field names from type_ids, + /// value_offsets, and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. + /// + /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. + /// \param[in] type_codes Vector of type codes. 
+ static Result> Make(const Array& type_ids, + const Array& value_offsets, + ArrayVector children, + std::vector field_names = {}, + std::vector type_codes = {}); + + const DenseUnionType* union_type() const { + return internal::checked_cast(union_type_); + } + + /// Note that this buffer does not account for any slice offset + const std::shared_ptr& value_offsets() const { return data_->buffers[2]; } + + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i]; } + + const int32_t* raw_value_offsets() const { return raw_value_offsets_; } + + protected: + const int32_t* raw_value_offsets_; + + void SetData(const std::shared_ptr& data); +}; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_primitive.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_primitive.h new file mode 100644 index 0000000000000000000000000000000000000000..3e2893b7dd89802b2610e057169e564fd4ca69ce --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_primitive.h @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Array accessor types for primitive/C-type-based arrays, such as numbers, +// boolean, and temporal types. + +#pragma once + +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/stl_iterator.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" // IWYU pragma: export +#include "arrow/type_traits.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// Concrete Array class for boolean data +class ARROW_EXPORT BooleanArray : public PrimitiveArray { + public: + using TypeClass = BooleanType; + using IteratorType = stl::ArrayIterator; + + explicit BooleanArray(const std::shared_ptr& data); + + BooleanArray(int64_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + bool Value(int64_t i) const { + return bit_util::GetBit(reinterpret_cast(raw_values_), + i + data_->offset); + } + + bool GetView(int64_t i) const { return Value(i); } + + std::optional operator[](int64_t i) const { return *IteratorType(*this, i); } + + /// \brief Return the number of false (0) values among the valid + /// values. Result is not cached. + int64_t false_count() const; + + /// \brief Return the number of true (1) values among the valid + /// values. Result is not cached. 
+ int64_t true_count() const; + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using PrimitiveArray::PrimitiveArray; +}; + +/// \addtogroup numeric-arrays +/// +/// @{ + +/// \brief Concrete Array class for numeric data with a corresponding C type +/// +/// This class is templated on the corresponding DataType subclass for the +/// given data, for example NumericArray or NumericArray. +/// +/// Note that convenience aliases are available for all accepted types +/// (for example Int8Array for NumericArray). +template +class NumericArray : public PrimitiveArray { + public: + using TypeClass = TYPE; + using value_type = typename TypeClass::c_type; + using IteratorType = stl::ArrayIterator>; + + explicit NumericArray(const std::shared_ptr& data) { SetData(data); } + + // Only enable this constructor without a type argument for types without additional + // metadata + template + NumericArray(enable_if_parameter_free length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) { + SetData(ArrayData::Make(TypeTraits::type_singleton(), length, {null_bitmap, data}, + null_count, offset)); + } + + const value_type* raw_values() const { return values_; } + + value_type Value(int64_t i) const { return values_[i]; } + + // For API compatibility with BinaryArray etc. + value_type GetView(int64_t i) const { return values_[i]; } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using PrimitiveArray::PrimitiveArray; + + void SetData(const std::shared_ptr& data) { + this->PrimitiveArray::SetData(data); + values_ = raw_values_ + ? (reinterpret_cast(raw_values_) + data_->offset) + : NULLPTR; + } + + const value_type* values_; +}; + +/// DayTimeArray +/// --------------------- +/// \brief Array of Day and Millisecond values. +class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { + public: + using TypeClass = DayTimeIntervalType; + using IteratorType = stl::ArrayIterator; + + explicit DayTimeIntervalArray(const std::shared_ptr& data); + + DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + DayTimeIntervalArray(int64_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + TypeClass::DayMilliseconds GetValue(int64_t i) const; + TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); } + + // For compatibility with Take kernel. + TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); } + + const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } +}; + +/// \brief Array of Month, Day and nanosecond values. 
+class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
+ public:
+  using TypeClass = MonthDayNanoIntervalType;
+  using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;
+
+  explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);
+
+  MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
+                            const std::shared_ptr<Buffer>& data,
+                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
+                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  TypeClass::MonthDayNanos GetValue(int64_t i) const;
+  TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); }
+
+  // For compatibility with Take kernel.
+  TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }
+
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+  std::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); }
+
+  const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
+};
+
+/// @}
+
+} // namespace arrow
diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_run_end.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_run_end.h
new file mode 100644
index 0000000000000000000000000000000000000000..b46b0855ab36776eec4e22cef1a35112e2d18fa8
--- /dev/null
+++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_run_end.h
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
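// ---------------------------------------------------------------------------
// [Editor's note] Minimal usage sketch for the primitive accessor classes
// declared in array_primitive.h above; it is not part of the vendored pyarrow
// sources. It assumes the public "arrow/api.h" umbrella header (which also
// provides Int32Builder from builder_primitive.h); the helper names BuildInt32
// and PrintInt32 are illustrative only.

#include <iostream>
#include <memory>
#include <optional>
#include <string>

#include "arrow/api.h"

// Build a small Int32Array (an alias for NumericArray<Int32Type>).
arrow::Result<std::shared_ptr<arrow::Int32Array>> BuildInt32() {
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  std::shared_ptr<arrow::Array> out;
  ARROW_RETURN_NOT_OK(builder.Finish(&out));
  return std::static_pointer_cast<arrow::Int32Array>(out);
}

// NumericArray exposes unchecked Value(i)/GetView(i) access plus STL-style
// iterators whose elements are std::optional<c_type> (nullopt for null slots).
void PrintInt32(const arrow::Int32Array& array) {
  for (std::optional<int32_t> slot : array) {
    std::cout << (slot.has_value() ? std::to_string(*slot) : "null") << "\n";
  }
}
// ---------------------------------------------------------------------------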
+ +// Array accessor classes run-end encoded arrays + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup run-end-encoded-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// RunEndEncoded + +/// \brief Array type for run-end encoded data +class ARROW_EXPORT RunEndEncodedArray : public Array { + private: + std::shared_ptr run_ends_array_; + std::shared_ptr values_array_; + + public: + using TypeClass = RunEndEncodedType; + + explicit RunEndEncodedArray(const std::shared_ptr& data); + + /// \brief Construct a RunEndEncodedArray from all parameters + /// + /// The length and offset parameters refer to the dimensions of the logical + /// array which is the array we would get after expanding all the runs into + /// repeated values. As such, length can be much greater than the length of + /// the child run_ends and values arrays. + RunEndEncodedArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& run_ends, + const std::shared_ptr& values, int64_t offset = 0); + + /// \brief Construct a RunEndEncodedArray from all parameters + /// + /// The length and offset parameters refer to the dimensions of the logical + /// array which is the array we would get after expanding all the runs into + /// repeated values. As such, length can be much greater than the length of + /// the child run_ends and values arrays. + static Result> Make( + const std::shared_ptr& type, int64_t logical_length, + const std::shared_ptr& run_ends, const std::shared_ptr& values, + int64_t logical_offset = 0); + + /// \brief Construct a RunEndEncodedArray from values and run ends arrays + /// + /// The data type is automatically inferred from the arguments. + /// The run_ends and values arrays must have the same length. + static Result> Make( + int64_t logical_length, const std::shared_ptr& run_ends, + const std::shared_ptr& values, int64_t logical_offset = 0); + + protected: + void SetData(const std::shared_ptr& data); + + public: + /// \brief Returns an array holding the logical indexes of each run-end + /// + /// The physical offset to the array is applied. + const std::shared_ptr& run_ends() const { return run_ends_array_; } + + /// \brief Returns an array holding the values of each run + /// + /// The physical offset to the array is applied. + const std::shared_ptr& values() const { return values_array_; } + + /// \brief Returns an array holding the logical indexes of each run end + /// + /// If a non-zero logical offset is set, this function allocates a new + /// array and rewrites all the run end values to be relative to the logical + /// offset and cuts the end of the array to the logical length. + Result> LogicalRunEnds(MemoryPool* pool) const; + + /// \brief Returns an array holding the values of each run + /// + /// If a non-zero logical offset is set, this function allocates a new + /// array containing only the values within the logical range. + std::shared_ptr LogicalValues() const; + + /// \brief Find the physical offset of this REE array + /// + /// This function uses binary-search, so it has a O(log N) cost. 
+  int64_t FindPhysicalOffset() const;
+
+  /// \brief Find the physical length of this REE array
+  ///
+  /// The physical length of an REE is the number of physical values (and
+  /// run-ends) necessary to represent the logical range of values from offset
+  /// to length.
+  ///
+  /// Avoid calling this function if the physical length can be established in
+  /// some other way (e.g. when iterating over the runs sequentially until the
+  /// end). This function uses binary-search, so it has a O(log N) cost.
+  int64_t FindPhysicalLength() const;
+};
+
+/// @}
+
+} // namespace arrow
diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_adaptive.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_adaptive.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cea571be3e3244741f3df15f87c8958eedddf76
--- /dev/null
+++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_adaptive.h
@@ -0,0 +1,215 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
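// ---------------------------------------------------------------------------
// [Editor's note] Minimal usage sketch for RunEndEncodedArray::Make declared
// in array_run_end.h above; it is not part of the vendored pyarrow sources.
// It assumes the "arrow/api.h" umbrella header (for Int32Builder and
// StringBuilder); the helper name BuildRee is illustrative only.

#include <memory>

#include "arrow/api.h"

// Encode the logical array ["a","a","b","b","b","c","c"] as three runs whose
// run ends are the logical indexes 2, 5 and 7.
arrow::Result<std::shared_ptr<arrow::RunEndEncodedArray>> BuildRee() {
  arrow::Int32Builder run_end_builder;
  ARROW_RETURN_NOT_OK(run_end_builder.AppendValues({2, 5, 7}));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> run_ends,
                        run_end_builder.Finish());

  arrow::StringBuilder value_builder;
  ARROW_RETURN_NOT_OK(value_builder.AppendValues({"a", "b", "c"}));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> values,
                        value_builder.Finish());

  // The Make overload used here infers the run-end-encoded type from its
  // run_ends and values children; the logical length of the result is 7.
  return arrow::RunEndEncodedArray::Make(/*logical_length=*/7, run_ends, values);
}
// ---------------------------------------------------------------------------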
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup numeric-builders +/// +/// @{ + +namespace internal { + +class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { + public: + AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment); + + explicit AdaptiveIntBuilderBase(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : AdaptiveIntBuilderBase(sizeof(uint8_t), pool, alignment) {} + + /// \brief Append multiple nulls + /// \param[in] length the number of nulls to append + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(CommitPendingData()); + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeSetNull(length); + } + return Status::OK(); + } + + Status AppendNull() final { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 0; + pending_has_nulls_ = true; + ++pending_pos_; + ++length_; + ++null_count_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(CommitPendingData()); + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeSetNotNull(length); + } + return Status::OK(); + } + + Status AppendEmptyValue() final { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + ++length_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + Status AppendInternal(const uint64_t val) { + pending_data_[pending_pos_] = val; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + ++length_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + virtual Status CommitPendingData() = 0; + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); + template + typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); + + std::shared_ptr data_; + uint8_t* raw_data_ = NULLPTR; + + const uint8_t start_int_size_; + uint8_t int_size_; + + static constexpr int32_t pending_size_ = 1024; + uint8_t pending_valid_[pending_size_]; + uint64_t pending_data_[pending_size_]; + int32_t pending_pos_ = 0; + bool pending_has_nulls_ = false; +}; + +} // namespace internal + +class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveUIntBuilder(uint8_t start_int_size, + MemoryPool* pool = default_memory_pool()); + + explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool()) + : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {} + + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const uint64_t val) { return AppendInternal(val); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values 
to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + Status ExpandIntSizeN(); +}; + +class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveIntBuilder(uint8_t start_int_size, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {} + + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const int64_t val) { return AppendInternal(static_cast(val)); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const int64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + Status ExpandIntSizeN(); +}; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_base.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_base.h new file mode 100644 index 0000000000000000000000000000000000000000..ecd2136f5d20ba126bd359977ea17f76c4fe23ed --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_base.h @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
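// ---------------------------------------------------------------------------
// [Editor's note] Minimal usage sketch for the AdaptiveIntBuilder declared in
// builder_adaptive.h above; it is not part of the vendored pyarrow sources.
// It assumes the "arrow/api.h" umbrella header; the helper name
// BuildAdaptiveInts is illustrative only.

#include <memory>

#include "arrow/api.h"

// The adaptive builders start with one-byte storage and widen it as larger
// values are appended, so the finished array should use the narrowest integer
// type that can hold everything seen.
arrow::Result<std::shared_ptr<arrow::Array>> BuildAdaptiveInts() {
  arrow::AdaptiveIntBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append(1));       // still fits in one byte
  ARROW_RETURN_NOT_OK(builder.Append(100000));  // needs a wider (int32) representation
  ARROW_RETURN_NOT_OK(builder.AppendNull());

  std::shared_ptr<arrow::Array> out;
  ARROW_RETURN_NOT_OK(builder.Finish(&out));
  // For these inputs we would expect out->type() to be int32().
  return out;
}
// ---------------------------------------------------------------------------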
+ +#pragma once + +#include // IWYU pragma: keep +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_primitive.h" +#include "arrow/buffer.h" +#include "arrow/buffer_builder.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { + +template +class ArrayBuilderExtraOps { + public: + /// \brief Append a value from an optional or null if it has no value. + Status AppendOrNull(const std::optional& value) { + auto* self = static_cast(this); + return value.has_value() ? self->Append(*value) : self->AppendNull(); + } + + /// \brief Append a value from an optional or null if it has no value. + /// + /// Unsafe methods don't check existing size. + void UnsafeAppendOrNull(const std::optional& value) { + auto* self = static_cast(this); + return value.has_value() ? self->UnsafeAppend(*value) : self->UnsafeAppendNull(); + } +}; + +} // namespace internal + +/// \defgroup numeric-builders Concrete builder subclasses for numeric types +/// @{ +/// @} + +/// \defgroup temporal-builders Concrete builder subclasses for temporal types +/// @{ +/// @} + +/// \defgroup binary-builders Concrete builder subclasses for binary types +/// @{ +/// @} + +/// \defgroup nested-builders Concrete builder subclasses for nested types +/// @{ +/// @} + +/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types +/// @{ +/// @} + +/// \defgroup run-end-encoded-builders Concrete builder subclasses for run-end encoded +/// arrays +/// @{ +/// @} + +constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; + +/// Base class for all data array builders. +/// +/// This class provides a facilities for incrementally building the null bitmap +/// (see Append methods) and as a side effect the current number of slots and +/// the null count. +/// +/// \note Users are expected to use builders as one of the concrete types below. +/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. +class ARROW_EXPORT ArrayBuilder { + public: + explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment) + : pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {} + + ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder); + + virtual ~ArrayBuilder() = default; + + /// For nested types. Since the objects are owned by this class instance, we + /// skip shared pointers and just return a raw pointer + ArrayBuilder* child(int i) { return children_[i].get(); } + + const std::shared_ptr& child_builder(int i) const { return children_[i]; } + + int num_children() const { return static_cast(children_.size()); } + + virtual int64_t length() const { return length_; } + int64_t null_count() const { return null_count_; } + int64_t capacity() const { return capacity_; } + + /// \brief Ensure that enough memory has been allocated to fit the indicated + /// number of total elements in the builder, including any that have already + /// been appended. Does not account for reallocations that may be due to + /// variable size data, like binary values. To make space for incremental + /// appends, use Reserve instead. + /// + /// \param[in] capacity the minimum number of total array values to + /// accommodate. Must be greater than the current capacity. 
+ /// \return Status + virtual Status Resize(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to append the indicated + /// number of elements without any further reallocation. Overallocation is + /// used in order to minimize the impact of incremental Reserve() calls. + /// Note that additional_capacity is relative to the current number of elements + /// rather than to the current capacity, so calls to Reserve() which are not + /// interspersed with addition of new elements may not increase the capacity. + /// + /// \param[in] additional_capacity the number of additional array values + /// \return Status + Status Reserve(int64_t additional_capacity) { + auto current_capacity = capacity(); + auto min_capacity = length() + additional_capacity; + if (min_capacity <= current_capacity) return Status::OK(); + + // leave growth factor up to BufferBuilder + auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity); + return Resize(new_capacity); + } + + /// Reset the builder. + virtual void Reset(); + + /// \brief Append a null value to builder + virtual Status AppendNull() = 0; + /// \brief Append a number of null values to builder + virtual Status AppendNulls(int64_t length) = 0; + + /// \brief Append a non-null value to builder + /// + /// The appended value is an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending a null value to a parent nested type. + virtual Status AppendEmptyValue() = 0; + + /// \brief Append a number of non-null values to builder + /// + /// The appended values are an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending null values to a parent nested type. + virtual Status AppendEmptyValues(int64_t length) = 0; + + /// \brief Append a value from a scalar + Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); } + virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats); + virtual Status AppendScalars(const ScalarVector& scalars); + + /// \brief Append a range of values from an array. + /// + /// The given array must be the same type as the builder. + virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array), + int64_t ARROW_ARG_UNUSED(offset), + int64_t ARROW_ARG_UNUSED(length)) { + return Status::NotImplemented("AppendArraySlice for builder for ", *type()); + } + + /// \brief Return result of builder as an internal generic ArrayData + /// object. Resets builder except for dictionary builder + /// + /// \param[out] out the finalized ArrayData object + /// \return Status + virtual Status FinishInternal(std::shared_ptr* out) = 0; + + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \param[out] out the finalized Array object + /// \return Status + Status Finish(std::shared_ptr* out); + + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \return The finalized Array object + Result> Finish(); + + /// \brief Return the type of the built Array + virtual std::shared_ptr type() const = 0; + + protected: + /// Append to null bitmap + Status AppendToBitmap(bool is_valid); + + /// Vector append. Treat each zero byte as a null. If valid_bytes is null + /// assume all of length bits are valid. 
+ Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + /// Uniform append. Append N times the same validity bit. + Status AppendToBitmap(int64_t num_bits, bool value); + + /// Set the next length bits to not null (i.e. valid). + Status SetNotNull(int64_t length); + + // Unsafe operations (don't check capacity/don't resize) + + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } + + // Append to null bitmap, update the length + void UnsafeAppendToBitmap(bool is_valid) { + null_bitmap_builder_.UnsafeAppend(is_valid); + ++length_; + if (!is_valid) ++null_count_; + } + + // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + if (valid_bytes == NULLPTR) { + return UnsafeSetNotNull(length); + } + null_bitmap_builder_.UnsafeAppend(valid_bytes, length); + length_ += length; + null_count_ = null_bitmap_builder_.false_count(); + } + + // Vector append. Copy from a given bitmap. If bitmap is null assume + // all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) { + if (bitmap == NULLPTR) { + return UnsafeSetNotNull(length); + } + null_bitmap_builder_.UnsafeAppend(bitmap, offset, length); + length_ += length; + null_count_ = null_bitmap_builder_.false_count(); + } + + // Append the same validity value a given number of times. + void UnsafeAppendToBitmap(const int64_t num_bits, bool value) { + if (value) { + UnsafeSetNotNull(num_bits); + } else { + UnsafeSetNull(num_bits); + } + } + + void UnsafeAppendToBitmap(const std::vector& is_valid); + + // Set the next validity bits to not null (i.e. valid). + void UnsafeSetNotNull(int64_t length); + + // Set the next validity bits to null (i.e. invalid). + void UnsafeSetNull(int64_t length); + + static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); + + /// \brief Finish to an array of the specified ArrayType + template + Status FinishTyped(std::shared_ptr* out) { + std::shared_ptr out_untyped; + ARROW_RETURN_NOT_OK(Finish(&out_untyped)); + *out = std::static_pointer_cast(std::move(out_untyped)); + return Status::OK(); + } + + // Check the requested capacity for validity + Status CheckCapacity(int64_t new_capacity) { + if (ARROW_PREDICT_FALSE(new_capacity < 0)) { + return Status::Invalid( + "Resize capacity must be positive (requested: ", new_capacity, ")"); + } + + if (ARROW_PREDICT_FALSE(new_capacity < length_)) { + return Status::Invalid("Resize cannot downsize (requested: ", new_capacity, + ", current length: ", length_, ")"); + } + + return Status::OK(); + } + + // Check for array type + Status CheckArrayType(const std::shared_ptr& expected_type, + const Array& array, const char* message); + Status CheckArrayType(Type::type expected_type, const Array& array, + const char* message); + + MemoryPool* pool_; + int64_t alignment_; + + TypedBufferBuilder null_bitmap_builder_; + int64_t null_count_ = 0; + + // Array length, so far. Also, the index of the next element to be added + int64_t length_ = 0; + int64_t capacity_ = 0; + + // Child value array builders. 
These are owned by this class
+  std::vector<std::shared_ptr<ArrayBuilder>> children_;
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
+};
+
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the data type to create the builder for
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                   std::unique_ptr<ArrayBuilder>* out);
+
+inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
+  std::unique_ptr<ArrayBuilder> out;
+  ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out));
+  return out;
+}
+
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type, where any top-level or nested dictionary builders return the
+/// exact index type specified by the type.
+ARROW_EXPORT
+Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                             std::unique_ptr<ArrayBuilder>* out);
+
+inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
+  std::unique_ptr<ArrayBuilder> out;
+  ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out));
+  return out;
+}
+
+/// \brief Construct an empty DictionaryBuilder initialized optionally
+/// with a preexisting dictionary
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the dictionary type to create the builder for
+/// \param[in] dictionary the initial dictionary, if any. May be nullptr
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                             const std::shared_ptr<Array>& dictionary,
+                             std::unique_ptr<ArrayBuilder>* out);
+
+inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
+    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
+    MemoryPool* pool = default_memory_pool()) {
+  std::unique_ptr<ArrayBuilder> out;
+  ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out));
+  return out;
+}
+
+} // namespace arrow
diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_binary.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_binary.h
new file mode 100644
index 0000000000000000000000000000000000000000..442e4a26320a2eab2e10b57735827e738bf07344
--- /dev/null
+++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_binary.h
@@ -0,0 +1,971 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
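// ---------------------------------------------------------------------------
// [Editor's note] Minimal usage sketch for the type-erased ArrayBuilder API
// declared in builder_base.h above; it is not part of the vendored pyarrow
// sources. It assumes the "arrow/api.h" umbrella header (for Int64Builder and
// the int64() type factory); the helper name BuildWithGenericBuilder is
// illustrative only.

#include <memory>

#include "arrow/api.h"

// MakeBuilder returns an ArrayBuilder for an arbitrary DataType; as the
// ArrayBuilder class comment above notes, it is meant to be downcast to the
// concrete builder before type-specific values are appended.
arrow::Result<std::shared_ptr<arrow::Array>> BuildWithGenericBuilder() {
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ArrayBuilder> builder,
                        arrow::MakeBuilder(arrow::int64()));
  auto* int_builder = static_cast<arrow::Int64Builder*>(builder.get());
  ARROW_RETURN_NOT_OK(int_builder->Append(42));
  ARROW_RETURN_NOT_OK(builder->AppendNull());
  return builder->Finish();  // the Result-returning Finish() declared above
}
// ---------------------------------------------------------------------------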
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_binary.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/data.h" +#include "arrow/buffer.h" +#include "arrow/buffer_builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/binary_view_util.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup binary-builders +/// +/// @{ + +// ---------------------------------------------------------------------- +// Binary and String + +template +class BaseBinaryBuilder + : public ArrayBuilder, + public internal::ArrayBuilderExtraOps, std::string_view> { + public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + + explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + offsets_builder_(pool, alignment), + value_data_builder_(pool, alignment) {} + + BaseBinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + : BaseBinaryBuilder(pool) {} + + Status Append(const uint8_t* value, offset_type length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNextOffset(); + // Safety check for UBSAN. + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + } + + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status Append(const char* value, offset_type length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// Extend the last appended value by appending more data at the end + /// + /// Unlike Append, this does not create a new offset. + Status ExtendCurrent(const uint8_t* value, offset_type length) { + // Safety check for UBSAN. + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + } + return Status::OK(); + } + + Status ExtendCurrent(std::string_view value) { + return ExtendCurrent(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + Status AppendNulls(int64_t length) final { + const int64_t num_bytes = value_data_builder_.length(); + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + UnsafeAppendToBitmap(length, false); + return Status::OK(); + } + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNextOffset(); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNextOffset(); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + const int64_t num_bytes = value_data_builder_.length(); + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + UnsafeAppendToBitmap(length, true); + return Status::OK(); + } + + /// \brief Append without checking capacity + /// + /// Offsets and data should have been presized using Reserve() and + /// ReserveData(), respectively. 
+ void UnsafeAppend(const uint8_t* value, offset_type length) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(value, length); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppend(const char* value, offset_type length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + /// Like ExtendCurrent, but do not check capacity + void UnsafeExtendCurrent(const uint8_t* value, offset_type length) { + value_data_builder_.UnsafeAppend(value, length); + } + + void UnsafeExtendCurrent(std::string_view value) { + UnsafeExtendCurrent(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + void UnsafeAppendNull() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a sequence of strings in one shot. + /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR) { + std::size_t total_length = std::accumulate( + values.begin(), values.end(), 0ULL, + [](uint64_t sum, const std::string& str) { return sum + str.size(); }); + ARROW_RETURN_NOT_OK(Reserve(values.size())); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); + + if (valid_bytes != NULLPTR) { + for (std::size_t i = 0; i < values.size(); ++i) { + UnsafeAppendNextOffset(); + if (valid_bytes[i]) { + value_data_builder_.UnsafeAppend( + reinterpret_cast(values[i].data()), values[i].size()); + } + } + } else { + for (const auto& value : values) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(reinterpret_cast(value.data()), + value.size()); + } + } + + UnsafeAppendToBitmap(valid_bytes, values.size()); + return Status::OK(); + } + + /// \brief Append a sequence of nul-terminated strings in one shot. + /// If one of the values is NULL, it is processed as a null + /// value even if the corresponding valid_bytes entry is 1. 
+ /// + /// \param[in] values a contiguous C array of nul-terminated char * + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const char** values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + std::size_t total_length = 0; + std::vector value_lengths(length); + bool have_null_value = false; + for (int64_t i = 0; i < length; ++i) { + if (values[i] != NULLPTR) { + auto value_length = strlen(values[i]); + value_lengths[i] = value_length; + total_length += value_length; + } else { + have_null_value = true; + } + } + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); + + if (valid_bytes) { + int64_t valid_bytes_offset = 0; + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + if (valid_bytes[i]) { + if (values[i]) { + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + } else { + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, + i - valid_bytes_offset); + UnsafeAppendToBitmap(false); + valid_bytes_offset = i + 1; + } + } + } + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); + } else { + if (have_null_value) { + std::vector valid_vector(length, 0); + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + if (values[i]) { + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + valid_vector[i] = 1; + } + } + UnsafeAppendToBitmap(valid_vector.data(), length); + } else { + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + } + UnsafeAppendToBitmap(NULLPTR, length); + } + } + return Status::OK(); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + auto bitmap = array.GetValues(0, 0); + auto offsets = array.GetValues(1); + auto data = array.GetValues(2, 0); + auto total_length = offsets[offset + length] - offsets[offset]; + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) { + const offset_type start = offsets[offset + i]; + const offset_type end = offsets[offset + i + 1]; + UnsafeAppend(data + start, end - start); + } else { + UnsafeAppendNull(); + } + } + return Status::OK(); + } + + void Reset() override { + ArrayBuilder::Reset(); + offsets_builder_.Reset(); + value_data_builder_.Reset(); + } + + Status ValidateOverflow(int64_t new_bytes) { + auto new_size = value_data_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + // One more than requested for offsets + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + return ArrayBuilder::Resize(capacity); + } + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements) { + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return value_data_builder_.Reserve(elements); + } + + Status 
FinishInternal(std::shared_ptr* out) override { + // Write final offset (values length) + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // These buffers' padding zeroed by BufferBuilder + std::shared_ptr offsets, value_data, null_bitmap; + ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); + ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + + *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data}, + null_count_, 0); + Reset(); + return Status::OK(); + } + + /// \return data pointer of the value date builder + const uint8_t* value_data() const { return value_data_builder_.data(); } + /// \return size of values buffer so far + int64_t value_data_length() const { return value_data_builder_.length(); } + /// \return capacity of values buffer + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } + + /// \return data pointer of the value date builder + const offset_type* offsets_data() const { return offsets_builder_.data(); } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i, offset_type* out_length) const { + const offset_type* offsets = offsets_builder_.data(); + const auto offset = offsets[i]; + if (i == (length_ - 1)) { + *out_length = static_cast(value_data_builder_.length()) - offset; + } else { + *out_length = offsets[i + 1] - offset; + } + return value_data_builder_.data() + offset; + } + + offset_type offset(int64_t i) const { return offsets_data()[i]; } + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + std::string_view GetView(int64_t i) const { + offset_type value_length; + const uint8_t* value_data = GetValue(i, &value_length); + return std::string_view(reinterpret_cast(value_data), value_length); + } + + // Cannot make this a static attribute because of linking issues + static constexpr int64_t memory_limit() { + return std::numeric_limits::max() - 1; + } + + protected: + TypedBufferBuilder offsets_builder_; + TypedBufferBuilder value_data_builder_; + + Status AppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + return offsets_builder_.Append(static_cast(num_bytes)); + } + + void UnsafeAppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } +}; + +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder { + public: + using BaseBinaryBuilder::BaseBinaryBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return binary(); } +}; + +/// \class StringBuilder +/// \brief Builder class for UTF8 strings +class ARROW_EXPORT StringBuilder : public BinaryBuilder { + public: + using BinaryBuilder::BinaryBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return utf8(); } +}; + +/// \class LargeBinaryBuilder +/// \brief Builder class for large variable-length binary data +class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder { + public: + using BaseBinaryBuilder::BaseBinaryBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// 
\endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return large_binary(); } +}; + +/// \class LargeStringBuilder +/// \brief Builder class for large UTF8 strings +class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { + public: + using LargeBinaryBuilder::LargeBinaryBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return large_utf8(); } +}; + +// ---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// These builders do not support building raw pointer view arrays. + +namespace internal { + +// We allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continuing filling underfull blocks +// after encountering a large string that required allocating a new block. +class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 32 << 10; // 32KB + + StringHeapBuilder(MemoryPool* pool, int64_t alignment) + : pool_(pool), alignment_(alignment) {} + + void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; } + + using c_type = BinaryViewType::c_type; + + template + std::conditional_t, c_type> Append(const uint8_t* value, + int64_t length) { + if (length <= BinaryViewType::kInlineSize) { + return util::ToInlineBinaryView(value, static_cast(length)); + } + + if constexpr (Safe) { + ARROW_RETURN_NOT_OK(Reserve(length)); + } + + auto v = util::ToNonInlineBinaryView(value, static_cast(length), + static_cast(blocks_.size() - 1), + current_offset_); + + memcpy(current_out_buffer_, value, static_cast(length)); + current_out_buffer_ += length; + current_remaining_bytes_ -= length; + current_offset_ += static_cast(length); + return v; + } + + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits::max(); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 2GB"); + } + if (num_bytes > current_remaining_bytes_) { + ARROW_RETURN_NOT_OK(FinishLastBlock()); + current_remaining_bytes_ = num_bytes > blocksize_ ? 
num_bytes : blocksize_; + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr new_block, + AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_)); + current_offset_ = 0; + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + Result>> Finish() { + if (!blocks_.empty()) { + ARROW_RETURN_NOT_OK(FinishLastBlock()); + } + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + Status FinishLastBlock() { + if (current_remaining_bytes_ > 0) { + // Avoid leaking uninitialized bytes from the allocator + ARROW_RETURN_NOT_OK( + blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_, + /*shrink_to_fit=*/true)); + blocks_.back()->ZeroPadding(); + } + return Status::OK(); + } + + MemoryPool* pool_; + int64_t alignment_; + int64_t blocksize_ = kDefaultBlocksize; + std::vector> blocks_; + + int32_t current_offset_ = 0; + uint8_t* current_out_buffer_ = NULLPTR; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool); + + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool, alignment) {} + + /// Set the size for future preallocated data buffers. + /// + /// The default size is 32KB, so after each 32KB of string data appended to the builder + /// a new data buffer will be allocated. Adjust this to a larger value to decrease the + /// frequency of allocation, or to a smaller value to lower the overhead of each + /// allocation. + void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); } + + /// The number of bytes which can be appended to this builder without allocating another + /// data buffer. 
+ int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + ARROW_ASSIGN_OR_RAISE(auto v, + data_heap_builder_.Append(value, length)); + data_builder_.UnsafeAppend(v); + return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 2GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + UnsafeAppendToBitmap(true); + auto v = data_heap_builder_.Append(value, length); + data_builder_.UnsafeAppend(v); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element (length-0 inline string) + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. 
Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override { return binary_view(); } + + protected: + TypedBufferBuilder data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr type() const override { return utf8_view(); } +}; + +// ---------------------------------------------------------------------- +// FixedSizeBinaryBuilder + +class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { + public: + using TypeClass = FixedSizeBinaryType; + + explicit FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + Status Append(const uint8_t* value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(value); + return Status::OK(); + } + + Status Append(const char* value) { + return Append(reinterpret_cast(value)); + } + + Status Append(std::string_view view) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(view); + return Status::OK(); + } + + Status Append(const std::string& s) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(s); + return Status::OK(); + } + + Status Append(const Buffer& s) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(s); + return Status::OK(); + } + + Status Append(const std::shared_ptr& s) { return Append(*s); } + + template + Status Append(const std::array& value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend( + std::string_view(reinterpret_cast(value.data()), value.size())); + return Status::OK(); + } + + Status AppendValues(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity, + int64_t bitmap_offset); + + Status AppendNull() final; + Status AppendNulls(int64_t length) final; + + Status AppendEmptyValue() final; + Status AppendEmptyValues(int64_t length) final; + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + return AppendValues( + array.GetValues(1, 0) + ((array.offset + offset) * byte_width_), length, + array.GetValues(0, 0), array.offset + offset); + } + + void UnsafeAppend(const uint8_t* value) { + UnsafeAppendToBitmap(true); + if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { + byte_builder_.UnsafeAppend(value, byte_width_); + } + } + + void UnsafeAppend(const char* value) { + UnsafeAppend(reinterpret_cast(value)); + } + + void UnsafeAppend(std::string_view value) { +#ifndef NDEBUG + CheckValueSize(static_cast(value.size())); +#endif + UnsafeAppend(reinterpret_cast(value.data())); + } + + void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); } + + void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } + + void UnsafeAppendNull() { + UnsafeAppendToBitmap(false); + byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); + 
} + + Status ValidateOverflow(int64_t new_bytes) const { + auto new_size = byte_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements) { + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return byte_builder_.Reserve(elements); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \return size of values buffer so far + int64_t value_data_length() const { return byte_builder_.length(); } + + int32_t byte_width() const { return byte_width_; } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + std::string_view GetView(int64_t i) const; + + static constexpr int64_t memory_limit() { + return std::numeric_limits::max() - 1; + } + + std::shared_ptr type() const override { + return fixed_size_binary(byte_width_); + } + + protected: + int32_t byte_width_; + BufferBuilder byte_builder_; + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + uint8_t* GetMutableValue(int64_t i) { + uint8_t* data_ptr = byte_builder_.mutable_data(); + return data_ptr + i * byte_width_; + } + + void CheckValueSize(int64_t size); +}; + +/// @} + +// ---------------------------------------------------------------------- +// Chunked builders: build a sequence of BinaryArray or StringArray that are +// limited to a particular size (to the upper limit of 2GB) + +namespace internal { + +class ARROW_EXPORT ChunkedBinaryBuilder { + public: + explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length, + MemoryPool* pool = default_memory_pool()); + + ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length, + MemoryPool* pool = default_memory_pool()); + + virtual ~ChunkedBinaryBuilder() = default; + + Status Append(const uint8_t* value, int32_t length) { + if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() > + max_chunk_value_length_)) { + if (builder_->value_data_length() == 0) { + // The current item is larger than max_chunk_size_; + // this chunk will be oversize and hold *only* this item + ARROW_RETURN_NOT_OK(builder_->Append(value, length)); + return NextChunk(); + } + // The current item would cause builder_->value_data_length() to exceed + // max_chunk_size_, so finish this chunk and append the current item to the next + // chunk + ARROW_RETURN_NOT_OK(NextChunk()); + return Append(value, length); + } + + if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) { + // The current item would cause builder_->length() to exceed max_chunk_length_, so + // finish this chunk and append the current item to the next chunk + ARROW_RETURN_NOT_OK(NextChunk()); + } + + return builder_->Append(value, length); + } + + Status Append(std::string_view value) { + return Append(reinterpret_cast(value.data()), + static_cast(value.size())); 
+ } + + Status AppendNull() { + if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + return builder_->AppendNull(); + } + + Status Reserve(int64_t values); + + virtual Status Finish(ArrayVector* out); + + protected: + Status NextChunk(); + + // maximum total character data size per chunk + int64_t max_chunk_value_length_; + + // maximum elements allowed per chunk + int64_t max_chunk_length_ = kListMaximumElements; + + // when Reserve() would cause builder_ to exceed its max_chunk_length_, + // add to extra_capacity_ instead and wait to reserve until the next chunk + int64_t extra_capacity_ = 0; + + std::unique_ptr builder_; + std::vector> chunks_; +}; + +class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder { + public: + using ChunkedBinaryBuilder::ChunkedBinaryBuilder; + + Status Finish(ArrayVector* out) override; +}; + +} // namespace internal + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_decimal.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_decimal.h new file mode 100644 index 0000000000000000000000000000000000000000..a0bf0a04220842cceada0d0754ad6be4e41a3093 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_decimal.h @@ -0,0 +1,164 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
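+// A minimal usage sketch for the decimal builders declared below in this header,
+// assuming the standard Arrow C++ API bundled with this package. The helpers
+// arrow::decimal128(), arrow::Decimal128::FromString(), ARROW_ASSIGN_OR_RAISE and
+// Decimal128Array come from other Arrow headers (type.h, util/decimal.h, result.h,
+// array_decimal.h); the appended value is expected to already match the type's scale.
+//
+//   arrow::Decimal128Builder builder(arrow::decimal128(/*precision=*/10, /*scale=*/2));
+//   ARROW_ASSIGN_OR_RAISE(arrow::Decimal128 value,
+//                         arrow::Decimal128::FromString("123.45"));
+//   ARROW_RETURN_NOT_OK(builder.Append(value));
+//   ARROW_RETURN_NOT_OK(builder.AppendNull());
+//   std::shared_ptr<arrow::Decimal128Array> result;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&result));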
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/array_decimal.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/data.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup numeric-builders
+///
+/// @{
+
+class ARROW_EXPORT Decimal32Builder : public FixedSizeBinaryBuilder {
+ public:
+  using TypeClass = Decimal32Type;
+  using ValueType = Decimal32;
+
+  explicit Decimal32Builder(const std::shared_ptr<DataType>& type,
+                            MemoryPool* pool = default_memory_pool(),
+                            int64_t alignment = kDefaultBufferAlignment);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(Decimal32 val);
+  void UnsafeAppend(Decimal32 val);
+  void UnsafeAppend(std::string_view val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<Decimal32Array>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+  std::shared_ptr<Decimal32Type> decimal_type_;
+};
+
+class ARROW_EXPORT Decimal64Builder : public FixedSizeBinaryBuilder {
+ public:
+  using TypeClass = Decimal64Type;
+  using ValueType = Decimal64;
+
+  explicit Decimal64Builder(const std::shared_ptr<DataType>& type,
+                            MemoryPool* pool = default_memory_pool(),
+                            int64_t alignment = kDefaultBufferAlignment);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(Decimal64 val);
+  void UnsafeAppend(Decimal64 val);
+  void UnsafeAppend(std::string_view val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<Decimal64Array>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+  std::shared_ptr<Decimal64Type> decimal_type_;
+};
+
+class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
+ public:
+  using TypeClass = Decimal128Type;
+  using ValueType = Decimal128;
+
+  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
+                             MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(Decimal128 val);
+  void UnsafeAppend(Decimal128 val);
+  void UnsafeAppend(std::string_view val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+  std::shared_ptr<Decimal128Type> decimal_type_;
+};
+
+class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
+ public:
+  using TypeClass = Decimal256Type;
+  using ValueType = Decimal256;
+
+  explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
+                             MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(const Decimal256& val);
+  void UnsafeAppend(const Decimal256& val);
+  void UnsafeAppend(std::string_view val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using
ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return decimal_type_; } + + protected: + std::shared_ptr decimal_type_; +}; + +using DecimalBuilder = Decimal128Builder; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_dict.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_dict.h new file mode 100644 index 0000000000000000000000000000000000000000..116c82049eea9ea49a716452090297f57be4eb6b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_dict.h @@ -0,0 +1,728 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_binary.h" +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export +#include "arrow/array/builder_primitive.h" // IWYU pragma: export +#include "arrow/array/data.h" +#include "arrow/array/util.h" +#include "arrow/scalar.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_block_counter.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Dictionary builder + +namespace internal { + +template +struct DictionaryValue { + using type = typename T::c_type; + using PhysicalType = T; +}; + +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = + typename std::conditional::value, + BinaryType, LargeBinaryType>::type; +}; + +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryType; +}; + +class ARROW_EXPORT DictionaryMemoTable { + public: + DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr& type); + DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr& dictionary); + ~DictionaryMemoTable(); + + Status GetArrayData(int64_t start_offset, std::shared_ptr* out); + + /// \brief Insert new memo values + Status InsertValues(const Array& values); + + int32_t size() const; + + template + Status GetOrInsert(typename DictionaryValue::type value, int32_t* out) { + // We want to keep the DictionaryMemoTable implementation private, also we can't + // use extern template classes because of compiler issues (MinGW?). 
Instead, + // we expose explicit function overrides for each supported physical type. + const typename DictionaryValue::PhysicalType* physical_type = NULLPTR; + return GetOrInsert(physical_type, value, out); + } + + private: + Status GetOrInsert(const BooleanType*, bool value, int32_t* out); + Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out); + Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out); + Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out); + Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out); + Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out); + Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out); + Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out); + Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out); + Status GetOrInsert(const DurationType*, int64_t value, int32_t* out); + Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out); + Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out); + Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out); + Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out); + Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out); + Status GetOrInsert(const MonthDayNanoIntervalType*, + MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out); + Status GetOrInsert(const DayTimeIntervalType*, + DayTimeIntervalType::DayMilliseconds value, int32_t* out); + Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out); + Status GetOrInsert(const FloatType*, float value, int32_t* out); + Status GetOrInsert(const DoubleType*, double value, int32_t* out); + + Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); + Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); + + class DictionaryMemoTableImpl; + std::unique_ptr impl_; +}; + +} // namespace internal + +/// \addtogroup dictionary-builders +/// +/// @{ + +namespace internal { + +/// \brief Array builder for created encoded DictionaryArray from +/// dense array +/// +/// Unlike other builders, dictionary builder does not completely +/// reset the state on Finish calls. +template +class DictionaryBuilderBase : public ArrayBuilder { + public: + using TypeClass = DictionaryType; + using Value = typename DictionaryValue::type; + + // WARNING: the type given below is the value type, not the DictionaryType. + // The DictionaryType is instantiated on the Finish() call. 
+ template + DictionaryBuilderBase(uint8_t start_int_size, + enable_if_t::value && + !is_fixed_size_binary_type::value, + const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(start_int_size, pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + enable_if_t::value, const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + const std::shared_ptr& index_type, + enable_if_t::value, const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(index_type, pool, alignment), + value_type_(value_type) {} + + template + DictionaryBuilderBase(uint8_t start_int_size, + enable_if_t::value && + is_fixed_size_binary_type::value, + const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(static_cast(*value_type).byte_width()), + indices_builder_(start_int_size, pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + enable_if_fixed_size_binary&> value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(static_cast(*value_type).byte_width()), + indices_builder_(pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + const std::shared_ptr& index_type, + enable_if_fixed_size_binary&> value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(static_cast(*value_type).byte_width()), + indices_builder_(index_type, pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + enable_if_parameter_free pool = default_memory_pool()) + : DictionaryBuilderBase(TypeTraits::type_singleton(), pool) {} + + // This constructor doesn't check for errors. Use InsertMemoValues instead. 
+ explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, dictionary)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(pool, alignment), + value_type_(dictionary->type()) {} + + ~DictionaryBuilderBase() override = default; + + /// \brief The current number of entries in the dictionary + int64_t dictionary_length() const { return memo_table_->size(); } + + /// \brief The value byte width (for FixedSizeBinaryType) + template + enable_if_fixed_size_binary byte_width() const { + return byte_width_; + } + + /// \brief Append a scalar value + Status Append(Value value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + + int32_t memo_index; + ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert(value, &memo_index)); + ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index)); + length_ += 1; + + return Status::OK(); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + enable_if_fixed_size_binary Append(const uint8_t* value) { + return Append(std::string_view(reinterpret_cast(value), byte_width_)); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + enable_if_fixed_size_binary Append(const char* value) { + return Append(std::string_view(value, byte_width_)); + } + + /// \brief Append a string (only for binary types) + template + enable_if_binary_like Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + /// \brief Append a string (only for binary types) + template + enable_if_binary_like Append(const char* value, int32_t length) { + return Append(std::string_view(value, length)); + } + + /// \brief Append a string (only for string types) + template + enable_if_string_like Append(const char* value, int32_t length) { + return Append(std::string_view(value, length)); + } + + /// \brief Append a decimal (only for Decimal32/64/128/256 Type) + template ::CType> + enable_if_decimal Append(const CType& value) { + auto bytes = value.ToBytes(); + return Append(bytes.data(), static_cast(bytes.size())); + } + + /// \brief Append a scalar null value + Status AppendNull() final { + length_ += 1; + null_count_ += 1; + + return indices_builder_.AppendNull(); + } + + Status AppendNulls(int64_t length) final { + length_ += length; + null_count_ += length; + + return indices_builder_.AppendNulls(length); + } + + Status AppendEmptyValue() final { + length_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + + Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override { + if (!scalar.is_valid) return AppendNulls(n_repeats); + + const auto& dict_ty = internal::checked_cast(*scalar.type); + const DictionaryScalar& dict_scalar = + internal::checked_cast(scalar); + const auto& dict = internal::checked_cast::ArrayType&>( + *dict_scalar.value.dictionary); + ARROW_RETURN_NOT_OK(Reserve(n_repeats)); + switch (dict_ty.index_type()->id()) { + case Type::UINT8: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::INT8: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::UINT16: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::INT16: + return AppendScalarImpl(dict, 
*dict_scalar.value.index, n_repeats); + case Type::UINT32: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::INT32: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::UINT64: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::INT64: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + default: + return Status::TypeError("Invalid index type: ", dict_ty); + } + return Status::OK(); + } + + Status AppendScalars(const ScalarVector& scalars) override { + for (const auto& scalar : scalars) { + ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1)); + } + return Status::OK(); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { + // Visit the indices and insert the unpacked values. + const auto& dict_ty = internal::checked_cast(*array.type); + // See if possible to avoid using ToArrayData here + const typename TypeTraits::ArrayType dict(array.dictionary().ToArrayData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + switch (dict_ty.index_type()->id()) { + case Type::UINT8: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::INT8: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::UINT16: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::INT16: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::UINT32: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::INT32: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::UINT64: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::INT64: + return AppendArraySliceImpl(dict, array, offset, length); + default: + return Status::TypeError("Invalid index type: ", dict_ty); + } + return Status::OK(); + } + + /// \brief Insert values into the dictionary's memo, but do not append any + /// indices. Can be used to initialize a new builder with known dictionary + /// values + /// \param[in] values dictionary values to add to memo. Type must match + /// builder type + Status InsertMemoValues(const Array& values) { + return memo_table_->InsertValues(values); + } + + /// \brief Append a whole dense array to the builder + template + enable_if_t::value, Status> AppendArray( + const Array& array) { + using ArrayType = typename TypeTraits::ArrayType; + +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + value_type_, array, "Wrong value type of array to be appended")); +#endif + + const auto& concrete_array = static_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + ARROW_RETURN_NOT_OK(AppendNull()); + } else { + ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i))); + } + } + return Status::OK(); + } + + template + enable_if_fixed_size_binary AppendArray(const Array& array) { +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + value_type_, array, "Wrong value type of array to be appended")); +#endif + + const auto& concrete_array = static_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + ARROW_RETURN_NOT_OK(AppendNull()); + } else { + ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i))); + } + } + return Status::OK(); + } + + void Reset() override { + // Perform a partial reset. 
Call ResetFull to also reset the accumulated + // dictionary values + ArrayBuilder::Reset(); + indices_builder_.Reset(); + } + + /// \brief Reset and also clear accumulated dictionary values in memo table + void ResetFull() { + Reset(); + memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_)); + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); + capacity_ = indices_builder_.capacity(); + return Status::OK(); + } + + /// \brief Return dictionary indices and a delta dictionary since the last + /// time that Finish or FinishDelta were called, and reset state of builder + /// (except the memo table) + Status FinishDelta(std::shared_ptr* out_indices, + std::shared_ptr* out_delta) { + std::shared_ptr indices_data; + std::shared_ptr delta_data; + ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data)); + *out_indices = MakeArray(indices_data); + *out_delta = MakeArray(delta_data); + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { + return ::arrow::dictionary(indices_builder_.type(), value_type_); + } + + protected: + template + Status AppendArraySliceImpl(const typename TypeTraits::ArrayType& dict, + const ArraySpan& array, int64_t offset, int64_t length) { + const c_type* values = array.GetValues(1) + offset; + return VisitBitBlocks( + array.buffers[0].data, array.offset + offset, length, + [&](const int64_t position) { + const int64_t index = static_cast(values[position]); + if (dict.IsValid(index)) { + return Append(dict.GetView(index)); + } + return AppendNull(); + }, + [&]() { return AppendNull(); }); + } + + template + Status AppendScalarImpl(const typename TypeTraits::ArrayType& dict, + const Scalar& index_scalar, int64_t n_repeats) { + using ScalarType = typename TypeTraits::ScalarType; + const auto index = internal::checked_cast(index_scalar).value; + if (index_scalar.is_valid && dict.IsValid(index)) { + const auto& value = dict.GetView(index); + for (int64_t i = 0; i < n_repeats; i++) { + ARROW_RETURN_NOT_OK(Append(value)); + } + return Status::OK(); + } + return AppendNulls(n_repeats); + } + + Status FinishInternal(std::shared_ptr* out) override { + std::shared_ptr dictionary; + ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary)); + + // Set type of array data to the right dictionary type + (*out)->type = type(); + (*out)->dictionary = dictionary; + return Status::OK(); + } + + Status FinishWithDictOffset(int64_t dict_offset, + std::shared_ptr* out_indices, + std::shared_ptr* out_dictionary) { + // Finalize indices array + ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices)); + + // Generate dictionary array from hash table contents + ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary)); + delta_offset_ = memo_table_->size(); + + // Update internals for further uses of this DictionaryBuilder + ArrayBuilder::Reset(); + return Status::OK(); + } + + std::unique_ptr memo_table_; + + // The size of the dictionary memo at last invocation of Finish, to use in + // FinishDelta for computing dictionary deltas + int32_t delta_offset_; + + // Only used for FixedSizeBinaryType + int32_t byte_width_; + + BuilderType indices_builder_; + std::shared_ptr value_type_; +}; + +template +class 
DictionaryBuilderBase : public ArrayBuilder { + public: + template + DictionaryBuilderBase( + enable_if_t::value, uint8_t> + start_int_size, + const std::shared_ptr& value_type, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} + + explicit DictionaryBuilderBase(const std::shared_ptr& value_type, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(pool) {} + + explicit DictionaryBuilderBase(const std::shared_ptr& index_type, + const std::shared_ptr& value_type, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(index_type, pool) {} + + template + explicit DictionaryBuilderBase( + enable_if_t::value, uint8_t> + start_int_size, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} + + explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(pool) {} + + explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(pool) {} + + /// \brief Append a scalar null value + Status AppendNull() final { + length_ += 1; + null_count_ += 1; + + return indices_builder_.AppendNull(); + } + + Status AppendNulls(int64_t length) final { + length_ += length; + null_count_ += length; + + return indices_builder_.AppendNulls(length); + } + + Status AppendEmptyValue() final { + length_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array) { +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + Type::NA, array, "Wrong value type of array to be appended")); +#endif + for (int64_t i = 0; i < array.length(); i++) { + ARROW_RETURN_NOT_OK(AppendNull()); + } + return Status::OK(); + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + + ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); + capacity_ = indices_builder_.capacity(); + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out)); + (*out)->type = dictionary((*out)->type, null()); + (*out)->dictionary = NullArray(0).data(); + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { + return ::arrow::dictionary(indices_builder_.type(), null()); + } + + protected: + BuilderType indices_builder_; +}; + +} // namespace internal + +/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the +/// smallest index size that can accommodate the dictionary indices +template +class DictionaryBuilder : public internal::DictionaryBuilderBase { + public: + using BASE = internal::DictionaryBuilderBase; + using BASE::BASE; + + /// \brief Append dictionary indices directly without modifying memo + /// + /// NOTE: Experimental API + Status AppendIndices(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + int64_t null_count_before = this->indices_builder_.null_count(); + 
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); + this->capacity_ = this->indices_builder_.capacity(); + this->length_ += length; + this->null_count_ += this->indices_builder_.null_count() - null_count_before; + return Status::OK(); + } +}; + +/// \brief A DictionaryArray builder that always returns int32 dictionary +/// indices so that data cast to dictionary form will have a consistent index +/// type, e.g. for creating a ChunkedArray +template +class Dictionary32Builder : public internal::DictionaryBuilderBase { + public: + using BASE = internal::DictionaryBuilderBase; + using BASE::BASE; + + /// \brief Append dictionary indices directly without modifying memo + /// + /// NOTE: Experimental API + Status AppendIndices(const int32_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + int64_t null_count_before = this->indices_builder_.null_count(); + ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); + this->capacity_ = this->indices_builder_.capacity(); + this->length_ += length; + this->null_count_ += this->indices_builder_.null_count() - null_count_before; + return Status::OK(); + } +}; + +// ---------------------------------------------------------------------- +// Binary / Unicode builders +// (compatibility aliases; those used to be derived classes with additional +// Append() overloads, but they have been folded into DictionaryBuilderBase) + +using BinaryDictionaryBuilder = DictionaryBuilder; +using StringDictionaryBuilder = DictionaryBuilder; +using BinaryDictionary32Builder = Dictionary32Builder; +using StringDictionary32Builder = Dictionary32Builder; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_nested.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_nested.h new file mode 100644 index 0000000000000000000000000000000000000000..d0e5b6d3c0edf2e0fc31dd9e1a3ca1fd22bf910b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_nested.h @@ -0,0 +1,836 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
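+// A minimal usage sketch for the list builders declared below in this header,
+// assuming the standard Arrow C++ API bundled with this package (Int64Builder,
+// default_memory_pool() and ListArray come from other Arrow headers). Values are
+// appended through the child builder; Append()/AppendNull() delimit the list slots.
+//
+//   auto value_builder = std::make_shared<arrow::Int64Builder>();
+//   arrow::ListBuilder list_builder(arrow::default_memory_pool(), value_builder);
+//   ARROW_RETURN_NOT_OK(list_builder.Append());      // start the slot [1, 2]
+//   ARROW_RETURN_NOT_OK(value_builder->Append(1));
+//   ARROW_RETURN_NOT_OK(value_builder->Append(2));
+//   ARROW_RETURN_NOT_OK(list_builder.AppendNull());  // a null list slot
+//   std::shared_ptr<arrow::ListArray> result;
+//   ARROW_RETURN_NOT_OK(list_builder.Finish(&result));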
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/data.h" +#include "arrow/buffer.h" +#include "arrow/buffer_builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup nested-builders +/// +/// @{ + +// ---------------------------------------------------------------------- +// VarLengthListLikeBuilder + +template +class VarLengthListLikeBuilder : public ArrayBuilder { + public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + + /// Use this constructor to incrementally build the value array along with offsets and + /// null bitmap. + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + offsets_builder_(pool, alignment), + value_builder_(value_builder), + value_field_(type->field(0)->WithType(NULLPTR)) {} + + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int64_t alignment = kDefaultBufferAlignment) + : VarLengthListLikeBuilder(pool, value_builder, + std::make_shared(value_builder->type()), + alignment) {} + + ~VarLengthListLikeBuilder() override = default; + + Status Resize(int64_t capacity) override { + if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { + return Status::CapacityError(type_name(), + " array cannot reserve space for more than ", + maximum_elements(), " got ", capacity); + } + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + + // One more than requested for list offsets + const int64_t offsets_capacity = + is_list_view(TYPE::type_id) ? capacity : capacity + 1; + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity)); + return ArrayBuilder::Resize(capacity); + } + + void Reset() override { + ArrayBuilder::Reset(); + offsets_builder_.Reset(); + value_builder_->Reset(); + } + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before appending elements to the + /// value builder. Elements appended to the value builder before this function + /// is called for the first time, will not be members of any list value. + /// + /// After this function is called, list_length elements SHOULD be appended to + /// the values builder. If this contract is violated, the behavior is defined by + /// the concrete builder implementation and SHOULD NOT be relied upon unless + /// the caller is specifically building a [Large]List or [Large]ListView array. + /// + /// For [Large]List arrays, the list slot length will be the number of elements + /// appended to the values builder before the next call to Append* or Finish. For + /// [Large]ListView arrays, the list slot length will be exactly list_length, but if + /// Append* is called before at least list_length elements are appended to the values + /// builder, the current list slot will share elements with the next list + /// slots or an invalid [Large]ListView array will be generated because there + /// aren't enough elements in the values builder to fill the list slots. + /// + /// If you're building a [Large]List and don't need to be compatible + /// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)` + /// is a simpler API. 
+ /// + /// \pre if is_valid is false, list_length MUST be 0 + /// \param is_valid Whether the new list slot is valid + /// \param list_length The number of elements in the list + Status Append(bool is_valid, int64_t list_length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + assert(is_valid || list_length == 0); + UnsafeAppendToBitmap(is_valid); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length); + return Status::OK(); + } + + Status AppendNull() final { + // Append() a null list slot with list_length=0. + // + // When building [Large]List arrays, elements being appended to the values builder + // before the next call to Append* or Finish will extend the list slot length, but + // that is totally fine because list arrays admit non-empty null list slots. + // + // In the case of [Large]ListViews that's not a problem either because the + // list slot length remains zero. + return Append(false, 0); + } + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, false); + UnsafeAppendEmptyDimensions(/*num_values=*/length); + return Status::OK(); + } + + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure list slot remains empty + Status AppendEmptyValue() final { return Append(true, 0); } + + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure the last list slot remains empty + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + UnsafeAppendEmptyDimensions(/*num_values=*/length); + return Status::OK(); + } + + /// \brief Vector append + /// + /// For list-array builders, the sizes are inferred from the offsets. + /// BaseListBuilder provides an implementation that doesn't take sizes, but + /// this virtual function allows dispatching calls to both list-array and + /// list-view-array builders (which need the sizes) + /// + /// \param offsets The offsets of the variable-length lists + /// \param sizes The sizes of the variable-length lists + /// \param length The number of offsets, sizes, and validity bits to append + /// \param valid_bytes If passed, valid_bytes is of equal length to values, + /// and any zero byte will be considered as a null for that slot + virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) = 0; + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + const offset_type* offsets = array.GetValues(1); + [[maybe_unused]] const offset_type* sizes = NULLPTR; + if constexpr (is_list_view(TYPE::type_id)) { + sizes = array.GetValues(2); + } + static_assert(internal::may_have_validity_bitmap(TYPE::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0].data : NULLPTR; + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t row = offset; row < offset + length; row++) { + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); + int64_t size = 0; + if (is_valid) { + if constexpr (is_list_view(TYPE::type_id)) { + size = sizes[row]; + } else { + size = offsets[row + 1] - offsets[row]; + } + } + UnsafeAppendToBitmap(is_valid); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size); + if (is_valid) { + ARROW_RETURN_NOT_OK( + value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size)); + } + } + return Status::OK(); + } + + Status ValidateOverflow(int64_t new_elements) const { + auto new_length = value_builder_->length() + new_elements; + if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { + return Status::CapacityError(type_name(), " array cannot contain more than ", + maximum_elements(), " elements, have ", new_elements); + } else { + return Status::OK(); + } + } + + ArrayBuilder* value_builder() const { return value_builder_.get(); } + + // Cannot make this a static attribute because of linking issues + static constexpr int64_t maximum_elements() { + return std::numeric_limits::max() - 1; + } + + std::shared_ptr type() const override { + return std::make_shared(value_field_->WithType(value_builder_->type())); + } + + private: + static constexpr const char* type_name() { + if constexpr (is_list_view(TYPE::type_id)) { + return "ListView"; + } else { + return "List"; + } + } + + protected: + /// \brief Append dimensions for num_values empty list slots. + /// + /// ListViewBuilder overrides this to also append the sizes. + virtual void UnsafeAppendEmptyDimensions(int64_t num_values) { + const int64_t offset = value_builder_->length(); + for (int64_t i = 0; i < num_values; ++i) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + } + + /// \brief Append dimensions for a single list slot. + /// + /// ListViewBuilder overrides this to also append the size. + virtual void UnsafeAppendDimensions(int64_t offset, int64_t ARROW_ARG_UNUSED(size)) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + + TypedBufferBuilder offsets_builder_; + std::shared_ptr value_builder_; + std::shared_ptr value_field_; +}; + +// ---------------------------------------------------------------------- +// ListBuilder / LargeListBuilder + +template +class BaseListBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + using BASE::Append; + + ~BaseListBuilder() override = default; + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true) { + // The value_length parameter to BASE::Append(bool, int64_t) is ignored when + // building a list array, so we can pass 0 here. 
+ return BASE::Append(is_valid, 0); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + // Offsets are assumed to be valid, but the first length-1 sizes have to be + // consistent with the offsets to partially rule out the possibility that the + // caller is passing sizes that could work if building a list-view, but don't + // work on building a list that requires offsets to be non-decreasing. + // + // CAUTION: the last size element (`sizes[length - 1]`) is not + // validated and could be inconsistent with the offsets given in a + // subsequent call to AppendValues. +#ifndef NDEBUG + if (sizes) { + for (int64_t i = 0; i < length - 1; ++i) { + if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) { + if (!valid_bytes || valid_bytes[i]) { + return Status::Invalid( + "BaseListBuilder: sizes are inconsistent with offsets provided"); + } + } + } + } +#endif + return AppendValues(offsets, length, valid_bytes); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + + Status AppendNextOffset() { + ARROW_RETURN_NOT_OK(this->ValidateOverflow(0)); + const int64_t num_values = this->value_builder_->length(); + return this->offsets_builder_.Append(static_cast(num_values)); + } + + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + std::shared_ptr null_bitmap; + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); + } +}; + +/// \class ListBuilder +/// \brief Builder class for variable-length list array value types +/// +/// To use this class, you must append values to the child array builder and use +/// the Append function to delimit each distinct list value (once the values +/// have been appended to the child array) or use the bulk API to append +/// a sequence of offsets and null values. +/// +/// A note on types. Per arrow/type.h all types in the c++ implementation are +/// logical so even though this class always builds list array, this can +/// represent multiple different logical types. If no logical type is provided +/// at construction time, the class defaults to List where t is taken from the +/// value_builder/values that the object is constructed with. 
+class ARROW_EXPORT ListBuilder : public BaseListBuilder { + public: + using BaseListBuilder::BaseListBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +/// \class LargeListBuilder +/// \brief Builder class for large variable-length list array value types +/// +/// Like ListBuilder, but to create large list arrays (with 64-bit offsets). +class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { + public: + using BaseListBuilder::BaseListBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +// ---------------------------------------------------------------------- +// ListViewBuilder / LargeListViewBuilder + +template +class BaseListViewBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + ~BaseListViewBuilder() override = default; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(BASE::Resize(capacity)); + return sizes_builder_.Resize(capacity); + } + + void Reset() override { + BASE::Reset(); + sizes_builder_.Reset(); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + this->sizes_builder_.UnsafeAppend(sizes, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Offset and sizes padding zeroed by BufferBuilder + std::shared_ptr null_bitmap; + std::shared_ptr offsets; + std::shared_ptr sizes; + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets), std::move(sizes)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); + } + + protected: + void UnsafeAppendEmptyDimensions(int64_t num_values) override { + for (int64_t i = 0; i < num_values; ++i) { + this->offsets_builder_.UnsafeAppend(0); + } + for (int64_t i = 0; i < num_values; ++i) { + this->sizes_builder_.UnsafeAppend(0); + } + } + + void UnsafeAppendDimensions(int64_t offset, int64_t size) override { + this->offsets_builder_.UnsafeAppend(static_cast(offset)); + this->sizes_builder_.UnsafeAppend(static_cast(size)); + } + + private: + TypedBufferBuilder sizes_builder_; +}; + +class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using 
ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +class ARROW_EXPORT LargeListViewBuilder final + : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +// ---------------------------------------------------------------------- +// Map builder + +/// \class MapBuilder +/// \brief Builder class for arrays of variable-size maps +/// +/// To use this class, you must use the Append function to delimit each distinct +/// map before appending values to the key and item array builders, or use the +/// bulk API to append a sequence of offsets and null maps. +/// +/// Key uniqueness and ordering are not validated. +class ARROW_EXPORT MapBuilder : public ArrayBuilder { + public: + /// Use this constructor to define the built array's type explicitly. If key_builder + /// or item_builder has indeterminate type, this builder will also. + MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, + const std::shared_ptr& item_builder, + const std::shared_ptr& type); + + /// Use this constructor to infer the built array's type. If key_builder or + /// item_builder has indeterminate type, this builder will also. + MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, + const std::shared_ptr& item_builder, bool keys_sorted = false); + + MapBuilder(MemoryPool* pool, const std::shared_ptr& item_builder, + const std::shared_ptr& type); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Start a new variable-length map slot + /// + /// This function should be called before beginning to append elements to the + /// key and item builders + Status Append(); + + Status AppendNull() final; + + Status AppendNulls(int64_t length) final; + + Status AppendEmptyValue() final; + + Status AppendEmptyValues(int64_t length) final; + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + const auto* offsets = array.GetValues(1); + static_assert(internal::may_have_validity_bitmap(MapType::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + for (int64_t row = offset; row < offset + length; row++) { + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); + if (is_valid) { + ARROW_RETURN_NOT_OK(Append()); + const int64_t slot_length = offsets[row + 1] - offsets[row]; + // Add together the inner StructArray offset to the Map/List offset + int64_t key_value_offset = array.child_data[0].offset + offsets[row]; + ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice( + array.child_data[0].child_data[0], key_value_offset, slot_length)); + ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice( + array.child_data[0].child_data[1], key_value_offset, slot_length)); + } else { + ARROW_RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + + /// \brief Get builder to append keys. 
+ /// + /// Append a key with this builder should be followed by appending + /// an item or null value with item_builder(). + ArrayBuilder* key_builder() const { return key_builder_.get(); } + + /// \brief Get builder to append items + /// + /// Appending an item with this builder should have been preceded + /// by appending a key with key_builder(). + ArrayBuilder* item_builder() const { return item_builder_.get(); } + + /// \brief Get builder to add Map entries as struct values. + /// + /// This is used instead of key_builder()/item_builder() and allows + /// the Map to be built as a list of struct values. + ArrayBuilder* value_builder() const { return list_builder_->value_builder(); } + + std::shared_ptr type() const override { + // Key and Item builder may update types, but they don't contain the field names, + // so we need to reconstruct the type. (See ARROW-13735.) + return std::make_shared( + field(entries_name_, + struct_({field(key_name_, key_builder_->type(), false), + field(item_name_, item_builder_->type(), item_nullable_)}), + false), + keys_sorted_); + } + + Status ValidateOverflow(int64_t new_elements) { + return list_builder_->ValidateOverflow(new_elements); + } + + protected: + inline Status AdjustStructBuilderLength(); + + protected: + bool keys_sorted_ = false; + bool item_nullable_ = false; + std::string entries_name_; + std::string key_name_; + std::string item_name_; + std::shared_ptr list_builder_; + std::shared_ptr key_builder_; + std::shared_ptr item_builder_; +}; + +// ---------------------------------------------------------------------- +// FixedSizeList builder + +/// \class FixedSizeListBuilder +/// \brief Builder class for fixed-length list array value types +class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { + public: + using TypeClass = FixedSizeListType; + + /// Use this constructor to define the built array's type explicitly. If value_builder + /// has indeterminate type, this builder will also. + FixedSizeListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int32_t list_size); + + /// Use this constructor to infer the built array's type. If value_builder has + /// indeterminate type, this builder will also. + FixedSizeListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Append a valid fixed length list. + /// + /// This function affects only the validity bitmap; the child values must be appended + /// using the child array builder. + Status Append(); + + /// \brief Vector append + /// + /// If passed, valid_bytes will be read and any zero byte + /// will cause the corresponding slot to be null + /// + /// This function affects only the validity bitmap; the child values must be appended + /// using the child array builder. This includes appending nulls for null lists. + /// XXX this restriction is confusing, should this method be omitted? + Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a null fixed length list. + /// + /// The child array builder will have the appropriate number of nulls appended + /// automatically. + Status AppendNull() final; + + /// \brief Append length null fixed length lists. 
+ /// + /// The child array builder will have the appropriate number of nulls appended + /// automatically. + Status AppendNulls(int64_t length) final; + + Status ValidateOverflow(int64_t new_elements); + + Status AppendEmptyValue() final; + + Status AppendEmptyValues(int64_t length) final; + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || bit_util::GetBit(validity, array.offset + row)) { + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( + array.child_data[0], list_size_ * (array.offset + row), list_size_)); + ARROW_RETURN_NOT_OK(Append()); + } else { + ARROW_RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + + ArrayBuilder* value_builder() const { return value_builder_.get(); } + + std::shared_ptr type() const override { + return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_); + } + + // Cannot make this a static attribute because of linking issues + static constexpr int64_t maximum_elements() { + return std::numeric_limits::max() - 1; + } + + protected: + std::shared_ptr value_field_; + const int32_t list_size_; + std::shared_ptr value_builder_; +}; + +// ---------------------------------------------------------------------- +// Struct + +// --------------------------------------------------------------------------------- +// StructArray builder +/// Append, Resize and Reserve methods are acting on StructBuilder. +/// Please make sure all these methods of all child-builders' are consistently +/// called to maintain data-structure consistency. +class ARROW_EXPORT StructBuilder : public ArrayBuilder { + public: + /// If any of field_builders has indeterminate type, this builder will also + StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector> field_builders); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// Null bitmap is of equal length to every child field, and any zero byte + /// will be considered as a null for that field, but users must using app- + /// end methods or advance methods of the child builders' independently to + /// insert data. + Status AppendValues(int64_t length, const uint8_t* valid_bytes) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// Append an element to the Struct. All child-builders' Append method must + /// be called independently to maintain data-structure consistency. + Status Append(bool is_valid = true) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + /// \brief Append a null value. Automatically appends an empty value to each child + /// builder. + Status AppendNull() final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); + } + return Append(false); + } + + /// \brief Append multiple null values. Automatically appends empty values to each + /// child builder. 
+ Status AppendNulls(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, false); + return Status::OK(); + } + + Status AppendEmptyValue() final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); + } + return Append(true); + } + + Status AppendEmptyValues(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + return Status::OK(); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + for (int i = 0; static_cast(i) < children_.size(); i++) { + ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(array.child_data[i], + array.offset + offset, length)); + } + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(validity, array.offset + offset, length); + return Status::OK(); + } + + void Reset() override; + + ArrayBuilder* field_builder(int i) const { return children_[i].get(); } + + int num_fields() const { return static_cast(children_.size()); } + + std::shared_ptr type() const override; + + private: + std::shared_ptr type_; +}; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_primitive.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_primitive.h new file mode 100644 index 0000000000000000000000000000000000000000..de7af1b46bdee2f7cecb5978bf84950bfac9b274 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_primitive.h @@ -0,0 +1,556 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
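[Editorial note, not part of the upstream headers] Before the primitive builders below, a minimal sketch of the StructBuilder contract described above: every child builder is advanced once per struct row, while the parent's Append()/AppendNull() maintains the validity bitmap. The field names, function name, and arrow/api.h include are illustrative assumptions.

#include <memory>
#include "arrow/api.h"

arrow::Status BuildStructRows(std::shared_ptr<arrow::Array>* out) {
  auto ints = std::make_shared<arrow::Int64Builder>();
  auto flags = std::make_shared<arrow::BooleanBuilder>();
  auto type = arrow::struct_(
      {arrow::field("x", arrow::int64()), arrow::field("ok", arrow::boolean())});
  arrow::StructBuilder builder(type, arrow::default_memory_pool(), {ints, flags});
  // Row 0: advance each child, then mark the row valid on the parent.
  ARROW_RETURN_NOT_OK(ints->Append(1));
  ARROW_RETURN_NOT_OK(flags->Append(true));
  ARROW_RETURN_NOT_OK(builder.Append());
  // Row 1: AppendNull() appends empty values to every child automatically.
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  return builder.Finish(out);
}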
+ +#pragma once + +#include +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" + +namespace arrow { + +class ARROW_EXPORT NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* pool = default_memory_pool(), + int64_t ARROW_ARG_UNUSED(alignment) = kDefaultBufferAlignment) + : ArrayBuilder(pool) {} + + explicit NullBuilder(const std::shared_ptr& ARROW_ARG_UNUSED(type), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NullBuilder(pool, alignment) {} + + /// \brief Append the specified number of null elements + Status AppendNulls(int64_t length) final { + if (length < 0) return Status::Invalid("length must be positive"); + null_count_ += length; + length_ += length; + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { return AppendNulls(1); } + + Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); } + + Status AppendEmptyValue() final { return AppendEmptyValues(1); } + + Status Append(std::nullptr_t) { return AppendNull(); } + + Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override { + return AppendNulls(length); + } + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + std::shared_ptr type() const override { return null(); } + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +/// \addtogroup numeric-builders +/// +/// @{ + +/// Base class for all Builders that emit an Array of a scalar numerical type. +template +class NumericBuilder + : public ArrayBuilder, + public internal::ArrayBuilderExtraOps, typename T::c_type> { + public: + using TypeClass = T; + using value_type = typename T::c_type; + using ArrayType = typename TypeTraits::ArrayType; + + template + explicit NumericBuilder( + enable_if_parameter_free pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + type_(TypeTraits::type_singleton()), + data_builder_(pool, alignment) {} + + NumericBuilder(const std::shared_ptr& type, MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), type_(type), data_builder_(pool, alignment) {} + + /// Append a single scalar and increase the size if necessary. 
+ Status Append(const value_type val) { + ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + /// The memory at the corresponding data slot is set to 0 to prevent + /// uninitialized memory access + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value_type{}); // zero + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(value_type{}); // zero + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(value_type{}); // zero + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value_type{}); // zero + UnsafeSetNotNull(length); + return Status::OK(); + } + + value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } + + void Reset() override { + data_builder_.Reset(); + ArrayBuilder::Reset(); + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + value_type operator[](int64_t index) const { return GetValue(index); } + + value_type& operator[](int64_t index) { + return reinterpret_cast(data_builder_.mutable_data())[index]; + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] bitmap a validity bitmap to copy (may be null) + /// \param[in] bitmap_offset an offset into the validity bitmap + /// \return Status + Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap, + int64_t bitmap_offset) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). 
Equal in length to values + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const std::vector& is_valid) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \return Status + Status AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); + } + + Status FinishInternal(std::shared_ptr* out) override { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, + null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_); + capacity_ = length_ = null_count_ = 0; + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + // this updates the length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values. 
+ /// \return Status + template + enable_if_t::value, Status> AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + return Status::OK(); + } + + // Same as above, with a pointer type ValidIter + template + enable_if_t::value, Status> AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + } + + return Status::OK(); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + return AppendValues(array.GetValues(1) + offset, length, + array.GetValues(0, 0), array.offset + offset); + } + + /// Append a single scalar under the assumption that the underlying Buffer is + /// large enough. + /// + /// This method does not capacity-check; make sure to call Reserve + /// beforehand. + void UnsafeAppend(const value_type val) { + ArrayBuilder::UnsafeAppendToBitmap(true); + data_builder_.UnsafeAppend(val); + } + + void UnsafeAppendNull() { + ArrayBuilder::UnsafeAppendToBitmap(false); + data_builder_.UnsafeAppend(value_type{}); // zero + } + + std::shared_ptr type() const override { return type_; } + + protected: + std::shared_ptr type_; + TypedBufferBuilder data_builder_; +}; + +// Builders + +using UInt8Builder = NumericBuilder; +using UInt16Builder = NumericBuilder; +using UInt32Builder = NumericBuilder; +using UInt64Builder = NumericBuilder; + +using Int8Builder = NumericBuilder; +using Int16Builder = NumericBuilder; +using Int32Builder = NumericBuilder; +using Int64Builder = NumericBuilder; + +using HalfFloatBuilder = NumericBuilder; +using FloatBuilder = NumericBuilder; +using DoubleBuilder = NumericBuilder; + +/// @} + +/// \addtogroup temporal-builders +/// +/// @{ + +using Date32Builder = NumericBuilder; +using Date64Builder = NumericBuilder; +using Time32Builder = NumericBuilder; +using Time64Builder = NumericBuilder; +using TimestampBuilder = NumericBuilder; +using MonthIntervalBuilder = NumericBuilder; +using DurationBuilder = NumericBuilder; + +/// @} + +class ARROW_EXPORT BooleanBuilder + : public ArrayBuilder, + public internal::ArrayBuilderExtraOps { + public: + using TypeClass = BooleanType; + using value_type = bool; + + explicit BooleanBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + BooleanBuilder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(int64_t length) final { + 
ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, false); + UnsafeSetNull(length); + return Status::OK(); + } + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNull(); + return Status::OK(); + } + + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(false); + UnsafeSetNotNull(1); + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, false); + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// Scalar append + Status Append(const bool val) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + Status Append(const uint8_t val) { return Append(val != 0); } + + /// Scalar append, without checking for capacity + void UnsafeAppend(const bool val) { + data_builder_.UnsafeAppend(val); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(false); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous array of bytes (non-zero is 1) + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a bitmap of values + /// \param[in] length the number of values to append + /// \param[in] validity a validity bitmap to copy (may be null) + /// \param[in] offset an offset into the values and validity bitmaps + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity, + int64_t offset); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). 
Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// or null(0) values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + // this updates length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values + /// \return Status + template + enable_if_t::value, Status> AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + return Status::OK(); + } + + // Same as above, for a pointer type ValidIter + template + enable_if_t::value, Status> AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + } + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + return Status::OK(); + } + + Status AppendValues(int64_t length, bool value); + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + return AppendValues(array.GetValues(1, 0), length, + array.GetValues(0, 0), array.offset + offset); + } + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + void Reset() override; + Status Resize(int64_t capacity) override; + + std::shared_ptr type() const override { return boolean(); } + + protected: + TypedBufferBuilder data_builder_; +}; + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_run_end.h 
b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_run_end.h new file mode 100644 index 0000000000000000000000000000000000000000..ac92efbd0dbe6b470b8275219e75b41aa3f7ab3a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_run_end.h @@ -0,0 +1,303 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" + +namespace arrow { + +/// \addtogroup run-end-encoded-builders +/// +/// @{ + +namespace internal { + +/// \brief An ArrayBuilder that deduplicates repeated values as they are +/// appended to the inner-ArrayBuilder and reports the length of the current run +/// of identical values. +/// +/// The following sequence of calls +/// +/// Append(2) +/// Append(2) +/// Append(2) +/// Append(7) +/// Append(7) +/// Append(2) +/// FinishInternal() +/// +/// will cause the inner-builder to receive only 3 Append calls +/// +/// Append(2) +/// Append(7) +/// Append(2) +/// FinishInternal() +/// +/// Note that values returned by length(), null_count() and capacity() are +/// related to the compressed array built by the inner-ArrayBuilder. +class RunCompressorBuilder : public ArrayBuilder { + public: + RunCompressorBuilder(MemoryPool* pool, std::shared_ptr inner_builder, + std::shared_ptr type); + + ~RunCompressorBuilder() override; + + ARROW_DISALLOW_COPY_AND_ASSIGN(RunCompressorBuilder); + + /// \brief Called right before a run is being closed + /// + /// Subclasses can override this function to perform an additional action when + /// a run is closed (i.e. run-length is known and value is appended to the + /// inner builder). + /// + /// \param value can be NULLPTR if closing a run of NULLs + /// \param length the greater than 0 length of the value run being closed + virtual Status WillCloseRun(const std::shared_ptr& value, + int64_t length) { + return Status::OK(); + } + + /// \brief Called right before a run of empty values is being closed + /// + /// Subclasses can override this function to perform an additional action when + /// a run of empty values is appended (i.e. run-length is known and a single + /// empty value is appended to the inner builder). + /// + /// \param length the greater than 0 length of the value run being closed + virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); } + + /// \brief Allocate enough memory for a given number of array elements. + /// + /// NOTE: Conservatively resizing a run-length compressed array for a given + /// number of logical elements is not possible, since the physical length will + /// vary depending on the values to be appended in the future. 
But we can + /// pessimistically assume that each run will contain a single value and + /// allocate that number of runs. + Status Resize(int64_t capacity) override { return ResizePhysical(capacity); } + + /// \brief Allocate enough memory for a given number of runs. + /// + /// Like Resize on non-encoded builders, it does not account for variable size + /// data. + Status ResizePhysical(int64_t capacity); + + Status ReservePhysical(int64_t additional_capacity) { + return Reserve(additional_capacity); + } + + void Reset() override; + + Status AppendNull() final { return AppendNulls(1); } + Status AppendNulls(int64_t length) override; + + Status AppendEmptyValue() final { return AppendEmptyValues(1); } + Status AppendEmptyValues(int64_t length) override; + + Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override; + Status AppendScalars(const ScalarVector& scalars) override; + + // AppendArraySlice() is not implemented. + + /// \brief Append a slice of an array containing values from already + /// compressed runs. + /// + /// NOTE: WillCloseRun() is not called as the length of each run cannot be + /// determined at this point. Caller should ensure that !has_open_run() by + /// calling FinishCurrentRun() before calling this. + /// + /// Pre-condition: !has_open_run() + Status AppendRunCompressedArraySlice(const ArraySpan& array, int64_t offset, + int64_t length); + + /// \brief Forces the closing of the current run if one is currently open. + /// + /// This can be called when one wants to ensure the current run will not be + /// extended. This may cause identical values to appear close to each other in + /// the underlying array (i.e. two runs that could be a single run) if more + /// values are appended after this is called. + /// + /// Finish() and FinishInternal() call this automatically. + virtual Status FinishCurrentRun(); + + Status FinishInternal(std::shared_ptr* out) override; + + ArrayBuilder& inner_builder() const { return *inner_builder_; } + + std::shared_ptr type() const override { return inner_builder_->type(); } + + bool has_open_run() const { return current_run_length_ > 0; } + int64_t open_run_length() const { return current_run_length_; } + + private: + inline void UpdateDimensions() { + capacity_ = inner_builder_->capacity(); + length_ = inner_builder_->length(); + null_count_ = inner_builder_->null_count(); + } + + private: + std::shared_ptr inner_builder_; + std::shared_ptr current_value_ = NULLPTR; + int64_t current_run_length_ = 0; +}; + +} // namespace internal + +// ---------------------------------------------------------------------- +// RunEndEncoded builder + +/// \brief Run-end encoded array builder. +/// +/// NOTE: the value returned by and capacity() is related to the +/// compressed array (physical) and not the decoded array (logical) that is +/// run-end encoded. null_count() always returns 0. length(), on the other hand, +/// returns the logical length of the run-end encoded array. +class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder { + private: + // An internal::RunCompressorBuilder that produces a run-end in the + // RunEndEncodedBuilder every time a value-run is closed. 
+ class ValueRunBuilder : public internal::RunCompressorBuilder { + public: + ValueRunBuilder(MemoryPool* pool, const std::shared_ptr& value_builder, + const std::shared_ptr& value_type, + RunEndEncodedBuilder& ree_builder); + + ~ValueRunBuilder() override = default; + + Status WillCloseRun(const std::shared_ptr&, int64_t length) override { + return ree_builder_.CloseRun(length); + } + + Status WillCloseRunOfEmptyValues(int64_t length) override { + return ree_builder_.CloseRun(length); + } + + private: + RunEndEncodedBuilder& ree_builder_; + }; + + public: + RunEndEncodedBuilder(MemoryPool* pool, + const std::shared_ptr& run_end_builder, + const std::shared_ptr& value_builder, + std::shared_ptr type); + + /// \brief Allocate enough memory for a given number of array elements. + /// + /// NOTE: Conservatively resizing an REE for a given number of logical + /// elements is not possible, since the physical length will vary depending on + /// the values to be appended in the future. But we can pessimistically assume + /// that each run will contain a single value and allocate that number of + /// runs. + Status Resize(int64_t capacity) override { return ResizePhysical(capacity); } + + /// \brief Allocate enough memory for a given number of runs. + Status ResizePhysical(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to append the indicated + /// number of run without any further reallocation. Overallocation is + /// used in order to minimize the impact of incremental ReservePhysical() calls. + /// Note that additional_capacity is relative to the current number of elements + /// rather than to the current capacity, so calls to Reserve() which are not + /// interspersed with addition of new elements may not increase the capacity. + /// + /// \param[in] additional_capacity the number of additional runs + /// \return Status + Status ReservePhysical(int64_t additional_capacity) { + return Reserve(additional_capacity); + } + + void Reset() override; + + Status AppendNull() final { return AppendNulls(1); } + Status AppendNulls(int64_t length) override; + + Status AppendEmptyValue() final { return AppendEmptyValues(1); } + Status AppendEmptyValues(int64_t length) override; + Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override; + Status AppendScalars(const ScalarVector& scalars) override; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Forces the closing of the current run if one is currently open. + /// + /// This can be called when one wants to ensure the current run will not be + /// extended. This may cause identical values to appear close to each other in + /// the values array (i.e. two runs that could be a single run) if more + /// values are appended after this is called. 
+ Status FinishCurrentRun(); + + std::shared_ptr type() const override; + + private: + /// \brief Update physical capacity and logical length + /// + /// \param committed_logical_length number of logical values that have been + /// committed to the values array + /// \param open_run_length number of logical values in the currently open run if any + inline void UpdateDimensions(int64_t committed_logical_length, + int64_t open_run_length) { + capacity_ = run_end_builder().capacity(); + length_ = committed_logical_length + open_run_length; + committed_logical_length_ = committed_logical_length; + } + + // Pre-condition: !value_run_builder_.has_open_run() + template + Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length); + + template + Status DoAppendRunEnd(int64_t run_end); + + /// \brief Cast run_end to the appropriate type and appends it to the run_ends + /// array. + Status AppendRunEnd(int64_t run_end); + + /// \brief Close a run by appending a value to the run_ends array and updating + /// length_ to reflect the new run. + /// + /// Pre-condition: run_length > 0. + [[nodiscard]] Status CloseRun(int64_t run_length); + + ArrayBuilder& run_end_builder(); + ArrayBuilder& value_builder(); + + private: + std::shared_ptr type_; + ValueRunBuilder* value_run_builder_; + // The length not counting the current open run in the value_run_builder_ + int64_t committed_logical_length_ = 0; +}; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_time.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_time.h new file mode 100644 index 0000000000000000000000000000000000000000..da29ae3124b5d3da32605503b29edf6920cdf6d6 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_time.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Contains declarations of time related Arrow builder types. 
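[Editorial note, not part of the upstream headers] Returning to builder_run_end.h above, a minimal sketch of the run-end-encoded workflow: runs are produced through AppendScalar(value, n_repeats) and AppendNulls(). The choice of Int32/Int64 child builders, the run_end_encoded() type factory, and the function name are assumptions based on the standard Arrow C++ API.

#include <memory>
#include "arrow/api.h"
#include "arrow/scalar.h"

arrow::Status BuildRunEndEncoded(std::shared_ptr<arrow::Array>* out) {
  auto run_ends = std::make_shared<arrow::Int32Builder>();
  auto values = std::make_shared<arrow::Int64Builder>();
  arrow::RunEndEncodedBuilder builder(
      arrow::default_memory_pool(), run_ends, values,
      arrow::run_end_encoded(arrow::int32(), arrow::int64()));
  // Logical contents: 7, 7, 7, null, null -- stored as two runs.
  ARROW_RETURN_NOT_OK(builder.AppendScalar(*arrow::MakeScalar(int64_t{7}), 3));
  ARROW_RETURN_NOT_OK(builder.AppendNulls(2));
  return builder.Finish(out);
}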
+ +#pragma once + +#include + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_primitive.h" + +namespace arrow { + +/// \addtogroup temporal-builders +/// +/// @{ + +// TODO(ARROW-7938): this class is untested + +class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder { + public: + using DayMilliseconds = DayTimeIntervalType::DayMilliseconds; + + explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : DayTimeIntervalBuilder(day_time_interval(), pool, alignment) {} + + explicit DayTimeIntervalBuilder(std::shared_ptr type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NumericBuilder(type, pool, alignment) {} +}; + +class ARROW_EXPORT MonthDayNanoIntervalBuilder + : public NumericBuilder { + public: + using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos; + + explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool, alignment) {} + + explicit MonthDayNanoIntervalBuilder(std::shared_ptr type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NumericBuilder(type, pool, alignment) {} +}; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_union.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_union.h new file mode 100644 index 0000000000000000000000000000000000000000..718ef4c32cebef1d30e4f7c036a7ab8f4b333e4a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_union.h @@ -0,0 +1,254 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/data.h" +#include "arrow/buffer_builder.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup nested-builders +/// +/// @{ + +/// \brief Base class for union array builds. +/// +/// Note that while we subclass ArrayBuilder, as union types do not have a +/// validity bitmap, the bitmap builder member of ArrayBuilder is not used. 
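[Editorial note, not part of the upstream header] A minimal sketch of the dense-union workflow implemented by the builders declared below: AppendChild() registers a child builder and returns its type code, Append(type_code) selects the child for the next slot, and the value is then appended to that child. The field names, function name, and arrow/api.h include are illustrative assumptions.

#include <memory>
#include "arrow/api.h"

arrow::Status BuildDenseUnion(std::shared_ptr<arrow::Array>* out) {
  arrow::DenseUnionBuilder builder(arrow::default_memory_pool());
  auto ints = std::make_shared<arrow::Int64Builder>();
  auto flags = std::make_shared<arrow::BooleanBuilder>();
  const int8_t int_code = builder.AppendChild(ints, "i");
  const int8_t flag_code = builder.AppendChild(flags, "b");
  // Slot 0 holds an int64, slot 1 holds a boolean.
  ARROW_RETURN_NOT_OK(builder.Append(int_code));
  ARROW_RETURN_NOT_OK(ints->Append(42));
  ARROW_RETURN_NOT_OK(builder.Append(flag_code));
  ARROW_RETURN_NOT_OK(flags->Append(true));
  return builder.Finish(out);
}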
+class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { + public: + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Make a new child builder available to the UnionArray + /// + /// \param[in] new_child the child builder + /// \param[in] field_name the name of the field in the union array type + /// if type inference is used + /// \return child index, which is the "type" argument that needs + /// to be passed to the "Append" method to add a new element to + /// the union array. + int8_t AppendChild(const std::shared_ptr& new_child, + const std::string& field_name = ""); + + std::shared_ptr type() const override; + + int64_t length() const override { return types_builder_.length(); } + + protected: + BasicUnionBuilder(MemoryPool* pool, int64_t alignment, + const std::vector>& children, + const std::shared_ptr& type); + + int8_t NextTypeId(); + + std::vector> child_fields_; + std::vector type_codes_; + UnionMode::type mode_; + + std::vector type_id_to_children_; + std::vector type_id_to_child_id_; + // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr + int8_t dense_type_id_ = 0; + TypedBufferBuilder types_builder_; +}; + +/// \class DenseUnionBuilder +/// +/// This API is EXPERIMENTAL. +class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { + public: + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit DenseUnionBuilder(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, {}, dense_union(FieldVector{})), + offsets_builder_(pool, alignment) {} + + /// Use this constructor to specify the type explicitly. 
+ /// You can still add child builders to the union after using this constructor + DenseUnionBuilder(MemoryPool* pool, + const std::vector>& children, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, children, type), + offsets_builder_(pool, alignment) {} + + Status AppendNull() final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(static_cast(child_builder->length()))); + // Append a null arbitrarily to the first child + return child_builder->AppendNull(); + } + + Status AppendNulls(int64_t length) final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(length, static_cast(child_builder->length()))); + // Append just a single null to the first child + return child_builder->AppendNull(); + } + + Status AppendEmptyValue() final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(static_cast(child_builder->length()))); + // Append an empty value arbitrarily to the first child + return child_builder->AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(length, static_cast(child_builder->length()))); + // Append just a single empty value to the first child + return child_builder->AppendEmptyValue(); + } + + /// \brief Append an element to the UnionArray. This must be followed + /// by an append to the appropriate child builder. + /// + /// \param[in] next_type type_id of the child to which the next value will be appended. + /// + /// The corresponding child builder must be appended to independently after this method + /// is called. + Status Append(int8_t next_type) { + ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); + if (type_id_to_children_[next_type]->length() == kListMaximumElements) { + return Status::CapacityError( + "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single " + "child"); + } + auto offset = static_cast(type_id_to_children_[next_type]->length()); + return offsets_builder_.Append(offset); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + Status FinishInternal(std::shared_ptr* out) override; + + private: + TypedBufferBuilder offsets_builder_; +}; + +/// \class SparseUnionBuilder +/// +/// This API is EXPERIMENTAL. +class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { + public: + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. 
+ explicit SparseUnionBuilder(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, {}, sparse_union(FieldVector{})) {} + + /// Use this constructor to specify the type explicitly. + /// You can still add child builders to the union after using this constructor + SparseUnionBuilder(MemoryPool* pool, + const std::vector>& children, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, children, type) {} + + /// \brief Append a null value. + /// + /// A null is appended to the first child, empty values to the other children. + Status AppendNull() final { + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull()); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue()); + } + return Status::OK(); + } + + /// \brief Append multiple null values. + /// + /// Nulls are appended to the first child, empty values to the other children. + Status AppendNulls(int64_t length) final { + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length)); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK( + type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length)); + } + return Status::OK(); + } + + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0])); + for (int8_t code : type_codes_) { + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue()); + } + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); + for (int8_t code : type_codes_) { + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length)); + } + return Status::OK(); + } + + /// \brief Append an element to the UnionArray. This must be followed + /// by an append to the appropriate child builder. + /// + /// \param[in] next_type type_id of the child to which the next value will be appended. + /// + /// The corresponding child builder must be appended to independently after this method + /// is called, and all other child builders must have null or empty value appended. + Status Append(int8_t next_type) { return types_builder_.Append(next_type); } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; +}; + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/concatenate.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/concatenate.h new file mode 100644 index 0000000000000000000000000000000000000000..aada5624d63a3052edddf0182799c474bee0c528 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/concatenate.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \param[out] out_suggested_cast if a non-OK Result is returned, the function might set +/// out_suggested_cast to a cast suggestion that would allow concatenating the arrays +/// without overflow of offsets (e.g. string to large_string) +/// +/// \return the concatenated array +ARROW_EXPORT +Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool, + std::shared_ptr* out_suggested_cast); + +} // namespace internal + +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \return the concatenated array +ARROW_EXPORT +Result> Concatenate(const ArrayVector& arrays, + MemoryPool* pool = default_memory_pool()); + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/data.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/data.h new file mode 100644 index 0000000000000000000000000000000000000000..1e6ee9a1d32ff25e0530e6b89ee321cb9a438119 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/data.h @@ -0,0 +1,676 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
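[Editorial note, not part of the upstream headers] For the public Concatenate() overload declared in concatenate.h above, a minimal sketch; the two input arrays are hypothetical and must share the same type, and the function name is illustrative.

#include <memory>
#include "arrow/api.h"
#include "arrow/array/concatenate.h"

arrow::Result<std::shared_ptr<arrow::Array>> ConcatTwo(
    const std::shared_ptr<arrow::Array>& a1,
    const std::shared_ptr<arrow::Array>& a2) {
  // The result is allocated from the default memory pool.
  return arrow::Concatenate({a1, a2}, arrow::default_memory_pool());
}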
+ +#pragma once + +#include // IWYU pragma: export +#include +#include +#include +#include +#include + +#include "arrow/array/statistics.h" +#include "arrow/buffer.h" +#include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/macros.h" +#include "arrow/util/span.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { +// ---------------------------------------------------------------------- +// Null handling for types without a validity bitmap and the dictionary type + +ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i); +ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i); +ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i); + +ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); +ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); +ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data); + +} // namespace internal + +// When slicing, we do not know the null count of the sliced range without +// doing some computation. To avoid doing this eagerly, we set the null count +// to -1 (any negative number will do). When Array::null_count is called the +// first time, the null count will be computed. See ARROW-33 +constexpr int64_t kUnknownNullCount = -1; + +// ---------------------------------------------------------------------- +// Generic array data container + +/// \class ArrayData +/// \brief Mutable container for generic Arrow array data +/// +/// This data structure is a self-contained representation of the memory and +/// metadata inside an Arrow array data structure (called vectors in Java). The +/// classes arrow::Array and its subclasses provide strongly-typed accessors +/// with support for the visitor pattern and other affordances. +/// +/// This class is designed for easy internal data manipulation, analytical data +/// processing, and data transport to and from IPC messages. For example, we +/// could cast from int64 to float64 like so: +/// +/// Int64Array arr = GetMyData(); +/// auto new_data = arr.data()->Copy(); +/// new_data->type = arrow::float64(); +/// DoubleArray double_arr(new_data); +/// +/// This object is also useful in an analytics setting where memory may be +/// reused. For example, if we had a group of operations all returning doubles, +/// say: +/// +/// Log(Sqrt(Expr(arr))) +/// +/// Then the low-level implementations of each of these functions could have +/// the signatures +/// +/// void Log(const ArrayData& values, ArrayData* out); +/// +/// As another example a function may consume one or more memory buffers in an +/// input array and replace them with newly-allocated data, changing the output +/// data type as well. 
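The zero-copy "type swap" in the comment above can be written as a small compilable helper. This editorial sketch mirrors the comment; the parameter stands in for the comment's GetMyData(), and the result reinterprets the same 8-byte slots rather than numerically converting them.

    #include <memory>
    #include "arrow/api.h"

    std::shared_ptr<arrow::DoubleArray> ReinterpretAsDouble(const arrow::Int64Array& arr) {
      // Copy() clones only the ArrayData descriptor; the buffers are shared,
      // so no element data is copied.
      std::shared_ptr<arrow::ArrayData> new_data = arr.data()->Copy();
      new_data->type = arrow::float64();
      return std::make_shared<arrow::DoubleArray>(std::move(new_data));
    }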
+struct ARROW_EXPORT ArrayData { + ArrayData() = default; + + ArrayData(std::shared_ptr type, int64_t length, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : type(std::move(type)), length(length), null_count(null_count), offset(offset) {} + + ArrayData(std::shared_ptr type, int64_t length, + std::vector> buffers, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : ArrayData(std::move(type), length, null_count, offset) { + this->buffers = std::move(buffers); +#ifndef NDEBUG + // in debug mode, call the `device_type` function to trigger + // the DCHECKs that validate all the buffers are on the same device + ARROW_UNUSED(this->device_type()); +#endif + } + + ArrayData(std::shared_ptr type, int64_t length, + std::vector> buffers, + std::vector> child_data, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : ArrayData(std::move(type), length, null_count, offset) { + this->buffers = std::move(buffers); + this->child_data = std::move(child_data); +#ifndef NDEBUG + // in debug mode, call the `device_type` function to trigger + // the DCHECKs that validate all the buffers (including children) + // are on the same device + ARROW_UNUSED(this->device_type()); +#endif + } + + static std::shared_ptr Make(std::shared_ptr type, int64_t length, + std::vector> buffers, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + static std::shared_ptr Make( + std::shared_ptr type, int64_t length, + std::vector> buffers, + std::vector> child_data, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + static std::shared_ptr Make( + std::shared_ptr type, int64_t length, + std::vector> buffers, + std::vector> child_data, + std::shared_ptr dictionary, int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + static std::shared_ptr Make(std::shared_ptr type, int64_t length, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + // Move constructor + ArrayData(ArrayData&& other) noexcept + : type(std::move(other.type)), + length(other.length), + offset(other.offset), + buffers(std::move(other.buffers)), + child_data(std::move(other.child_data)), + dictionary(std::move(other.dictionary)), + statistics(std::move(other.statistics)) { + SetNullCount(other.null_count); + } + + // Copy constructor + ArrayData(const ArrayData& other) noexcept + : type(other.type), + length(other.length), + offset(other.offset), + buffers(other.buffers), + child_data(other.child_data), + dictionary(other.dictionary), + statistics(other.statistics) { + SetNullCount(other.null_count); + } + + // Move assignment + ArrayData& operator=(ArrayData&& other) { + type = std::move(other.type); + length = other.length; + SetNullCount(other.null_count); + offset = other.offset; + buffers = std::move(other.buffers); + child_data = std::move(other.child_data); + dictionary = std::move(other.dictionary); + statistics = std::move(other.statistics); + return *this; + } + + // Copy assignment + ArrayData& operator=(const ArrayData& other) { + type = other.type; + length = other.length; + SetNullCount(other.null_count); + offset = other.offset; + buffers = other.buffers; + child_data = other.child_data; + dictionary = other.dictionary; + statistics = other.statistics; + return *this; + } + + std::shared_ptr Copy() const { return std::make_shared(*this); } + + /// \brief Copy all buffers and children recursively to destination MemoryManager + /// + /// This utilizes MemoryManager::CopyBuffer to create a new ArrayData object + /// recursively copying the buffers and all 
child buffers to the destination + /// memory manager. This includes dictionaries if applicable. + Result> CopyTo( + const std::shared_ptr& to) const; + /// \brief View or Copy this ArrayData to destination memory manager. + /// + /// Tries to view the buffer contents on the given memory manager's device + /// if possible (to avoid a copy) but falls back to copying if a no-copy view + /// isn't supported. + Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + + bool IsNull(int64_t i) const { return !IsValid(i); } + + bool IsValid(int64_t i) const { + if (buffers[0] != NULLPTR) { + return bit_util::GetBit(buffers[0]->data(), i + offset); + } + const auto type = this->type->id(); + if (type == Type::SPARSE_UNION) { + return !internal::IsNullSparseUnion(*this, i); + } + if (type == Type::DENSE_UNION) { + return !internal::IsNullDenseUnion(*this, i); + } + if (type == Type::RUN_END_ENCODED) { + return !internal::IsNullRunEndEncoded(*this, i); + } + return null_count.load() != length; + } + + // Access a buffer's data as a typed C pointer + template + inline const T* GetValues(int i, int64_t absolute_offset) const { + if (buffers[i]) { + return reinterpret_cast(buffers[i]->data()) + absolute_offset; + } else { + return NULLPTR; + } + } + + template + inline const T* GetValues(int i) const { + return GetValues(i, offset); + } + + // Like GetValues, but returns NULLPTR instead of aborting if the underlying + // buffer is not a CPU buffer. + template + inline const T* GetValuesSafe(int i, int64_t absolute_offset) const { + if (buffers[i] && buffers[i]->is_cpu()) { + return reinterpret_cast(buffers[i]->data()) + absolute_offset; + } else { + return NULLPTR; + } + } + + template + inline const T* GetValuesSafe(int i) const { + return GetValuesSafe(i, offset); + } + + // Access a buffer's data as a typed C pointer + template + inline T* GetMutableValues(int i, int64_t absolute_offset) { + if (buffers[i]) { + return reinterpret_cast(buffers[i]->mutable_data()) + absolute_offset; + } else { + return NULLPTR; + } + } + + template + inline T* GetMutableValues(int i) { + return GetMutableValues(i, offset); + } + + /// \brief Construct a zero-copy slice of the data with the given offset and length + /// + /// The associated `ArrayStatistics` is always discarded in a sliced + /// `ArrayData`. Because `ArrayStatistics` in the original + /// `ArrayData` may be invalid in a sliced `ArrayData`. If you want + /// to reuse statistics in the original `ArrayData`, you need to do + /// it by yourself. + /// + /// If the specified slice range has the same range as the original + /// `ArrayData`, we can reuse statistics in the original + /// `ArrayData`. Because it has the same data as the original + /// `ArrayData`. But the associated `ArrayStatistics` is discarded + /// in this case too. Use `Copy()` instead for the case. + std::shared_ptr Slice(int64_t offset, int64_t length) const; + + /// \brief Input-checking variant of Slice + /// + /// An Invalid Status is returned if the requested slice falls out of bounds. + /// Note that unlike Slice, `length` isn't clamped to the available buffer size. + Result> SliceSafe(int64_t offset, int64_t length) const; + + void SetNullCount(int64_t v) { null_count.store(v); } + + /// \brief Return physical null count, or compute and set it if it's not known + int64_t GetNullCount() const; + + /// \brief Return true if the data has a validity bitmap and the physical null + /// count is known to be non-zero or not yet known. 
+ /// + /// Note that this is not the same as MayHaveLogicalNulls, which also checks + /// for the presence of nulls in child data for types like unions and run-end + /// encoded types. + /// + /// \see HasValidityBitmap + /// \see MayHaveLogicalNulls + bool MayHaveNulls() const { + // If an ArrayData is slightly malformed it may have kUnknownNullCount set + // but no buffer + return null_count.load() != 0 && buffers[0] != NULLPTR; + } + + /// \brief Return true if the data has a validity bitmap + bool HasValidityBitmap() const { return buffers[0] != NULLPTR; } + + /// \brief Return true if the validity bitmap may have 0's in it, or if the + /// child arrays (in the case of types without a validity bitmap) may have + /// nulls, or if the dictionary of dictionay array may have nulls. + /// + /// This is not a drop-in replacement for MayHaveNulls, as historically + /// MayHaveNulls() has been used to check for the presence of a validity + /// bitmap that needs to be checked. + /// + /// Code that previously used MayHaveNulls() and then dealt with the validity + /// bitmap directly can be fixed to handle all types correctly without + /// performance degradation when handling most types by adopting + /// HasValidityBitmap and MayHaveLogicalNulls. + /// + /// Before: + /// + /// uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + /// for (int64_t i = 0; i < array.length; ++i) { + /// if (validity && !bit_util::GetBit(validity, i)) { + /// continue; // skip a NULL + /// } + /// ... + /// } + /// + /// After: + /// + /// bool all_valid = !array.MayHaveLogicalNulls(); + /// uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + /// for (int64_t i = 0; i < array.length; ++i) { + /// bool is_valid = all_valid || + /// (validity && bit_util::GetBit(validity, i)) || + /// array.IsValid(i); + /// if (!is_valid) { + /// continue; // skip a NULL + /// } + /// ... + /// } + bool MayHaveLogicalNulls() const { + if (buffers[0] != NULLPTR) { + return null_count.load() != 0; + } + const auto t = type->id(); + if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) { + return internal::UnionMayHaveLogicalNulls(*this); + } + if (t == Type::RUN_END_ENCODED) { + return internal::RunEndEncodedMayHaveLogicalNulls(*this); + } + if (t == Type::DICTIONARY) { + return internal::DictionaryMayHaveLogicalNulls(*this); + } + return null_count.load() != 0; + } + + /// \brief Computes the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// GetNullCount. For types that have no validity bitmap, this function will + /// recompute the null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + + /// \brief Return the device_type of the underlying buffers and children + /// + /// If there are no buffers in this ArrayData object, it just returns + /// DeviceAllocationType::kCPU as a default. We also assume that all buffers + /// should be allocated on the same device type and perform DCHECKs to confirm + /// this in debug mode. + /// + /// \return DeviceAllocationType + DeviceAllocationType device_type() const; + + std::shared_ptr type; + int64_t length = 0; + mutable std::atomic null_count{0}; + // The logical start point into the physical buffers (in values, not bytes). 
+ // Note that, for child data, this must be *added* to the child data's own offset. + int64_t offset = 0; + std::vector> buffers; + std::vector> child_data; + + // The dictionary for this Array, if any. Only used for dictionary type + std::shared_ptr dictionary; + + // The statistics for this Array. + std::shared_ptr statistics; +}; + +/// \brief A non-owning Buffer reference +struct ARROW_EXPORT BufferSpan { + // It is the user of this class's responsibility to ensure that + // buffers that were const originally are not written to + // accidentally. + uint8_t* data = NULLPTR; + int64_t size = 0; + // Pointer back to buffer that owns this memory + const std::shared_ptr* owner = NULLPTR; + + template + const T* data_as() const { + return reinterpret_cast(data); + } + template + T* mutable_data_as() { + return reinterpret_cast(data); + } +}; + +/// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply +/// copyable and does not contain any shared_ptr objects. Do not use in public +/// APIs aside from compute kernels for now +struct ARROW_EXPORT ArraySpan { + const DataType* type = NULLPTR; + int64_t length = 0; + mutable int64_t null_count = kUnknownNullCount; + int64_t offset = 0; + BufferSpan buffers[3]; + + ArraySpan() = default; + + explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {} + + ArraySpan(const ArrayData& data) { // NOLINT implicit conversion + SetMembers(data); + } + explicit ArraySpan(const Scalar& data) { FillFromScalar(data); } + + /// If dictionary-encoded, put dictionary in the first entry + std::vector child_data; + + /// \brief Populate ArraySpan to look like an array of length 1 pointing at + /// the data members of a Scalar value + void FillFromScalar(const Scalar& value); + + void SetMembers(const ArrayData& data); + + void SetBuffer(int index, const std::shared_ptr& buffer) { + this->buffers[index].data = const_cast(buffer->data()); + this->buffers[index].size = buffer->size(); + this->buffers[index].owner = &buffer; + } + + const ArraySpan& dictionary() const { return child_data[0]; } + + /// \brief Return the number of buffers (out of 3) that are used to + /// constitute this array + int num_buffers() const; + + // Access a buffer's data as a typed C pointer + template + inline T* GetValues(int i, int64_t absolute_offset) { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline T* GetValues(int i) { + return GetValues(i, this->offset); + } + + // Access a buffer's data as a typed C pointer + template + inline const T* GetValues(int i, int64_t absolute_offset) const { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline const T* GetValues(int i) const { + return GetValues(i, this->offset); + } + + /// \brief Access a buffer's data as a span + /// + /// \param i The buffer index + /// \param length The required length (in number of typed values) of the requested span + /// \pre i > 0 + /// \pre length <= the length of the buffer (in number of values) that's expected for + /// this array type + /// \return A span of the requested length + template + util::span GetSpan(int i, int64_t length) const { + const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); + assert(i > 0 && length + offset <= buffer_length); + ARROW_UNUSED(buffer_length); + return util::span(buffers[i].data_as() + this->offset, length); + } + + /// \brief Access a buffer's data as a span + /// + /// \param i The buffer index + /// \param length The required length 
(in number of typed values) of the requested span + /// \pre i > 0 + /// \pre length <= the length of the buffer (in number of values) that's expected for + /// this array type + /// \return A span of the requested length + template + util::span GetSpan(int i, int64_t length) { + const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); + assert(i > 0 && length + offset <= buffer_length); + ARROW_UNUSED(buffer_length); + return util::span(buffers[i].mutable_data_as() + this->offset, length); + } + + inline bool IsNull(int64_t i) const { return !IsValid(i); } + + inline bool IsValid(int64_t i) const { + if (this->buffers[0].data != NULLPTR) { + return bit_util::GetBit(this->buffers[0].data, i + this->offset); + } else { + const auto type = this->type->id(); + if (type == Type::SPARSE_UNION) { + return !IsNullSparseUnion(i); + } + if (type == Type::DENSE_UNION) { + return !IsNullDenseUnion(i); + } + if (type == Type::RUN_END_ENCODED) { + return !IsNullRunEndEncoded(i); + } + return this->null_count != this->length; + } + } + + std::shared_ptr ToArrayData() const; + + std::shared_ptr ToArray() const; + + std::shared_ptr GetBuffer(int index) const { + const BufferSpan& buf = this->buffers[index]; + if (buf.owner) { + return *buf.owner; + } else if (buf.data != NULLPTR) { + // Buffer points to some memory without an owning buffer + return std::make_shared(buf.data, buf.size); + } else { + return NULLPTR; + } + } + + void SetSlice(int64_t offset, int64_t length) { + this->offset = offset; + this->length = length; + if (this->type->id() == Type::NA) { + this->null_count = this->length; + } else if (this->MayHaveNulls()) { + this->null_count = kUnknownNullCount; + } else { + this->null_count = 0; + } + } + + /// \brief Return physical null count, or compute and set it if it's not known + int64_t GetNullCount() const; + + /// \brief Return true if the array has a validity bitmap and the physical null + /// count is known to be non-zero or not yet known + /// + /// Note that this is not the same as MayHaveLogicalNulls, which also checks + /// for the presence of nulls in child data for types like unions and run-end + /// encoded types. + /// + /// \see HasValidityBitmap + /// \see MayHaveLogicalNulls + bool MayHaveNulls() const { + // If an ArrayData is slightly malformed it may have kUnknownNullCount set + // but no buffer + return null_count != 0 && buffers[0].data != NULLPTR; + } + + /// \brief Return true if the array has a validity bitmap + bool HasValidityBitmap() const { return buffers[0].data != NULLPTR; } + + /// \brief Return true if the validity bitmap may have 0's in it, or if the + /// child arrays (in the case of types without a validity bitmap) may have + /// nulls, or if the dictionary of dictionay array may have nulls. + /// + /// \see ArrayData::MayHaveLogicalNulls + bool MayHaveLogicalNulls() const { + if (buffers[0].data != NULLPTR) { + return null_count != 0; + } + const auto t = type->id(); + if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) { + return UnionMayHaveLogicalNulls(); + } + if (t == Type::RUN_END_ENCODED) { + return RunEndEncodedMayHaveLogicalNulls(); + } + if (t == Type::DICTIONARY) { + return DictionaryMayHaveLogicalNulls(); + } + return null_count != 0; + } + + /// \brief Compute the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// GetNullCount. 
For types that have no validity bitmap, this function will + /// recompute the logical null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + + /// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic + /// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into + /// buffers[2]; IE buffers[2].data points to the first shared_ptr of the + /// variadic set and buffers[2].size is the number of variadic buffers times + /// sizeof(shared_ptr). + /// + /// \see HasVariadicBuffers + util::span> GetVariadicBuffers() const; + bool HasVariadicBuffers() const; + + private: + ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& span, + int64_t i); + + bool IsNullSparseUnion(int64_t i) const; + bool IsNullDenseUnion(int64_t i) const; + + /// \brief Return true if the value at logical index i is null + /// + /// This function uses binary-search, so it has a O(log N) cost. + /// Iterating over the whole array and calling IsNull is O(N log N), so + /// for better performance it is recommended to use a + /// ree_util::RunEndEncodedArraySpan to iterate run by run instead. + bool IsNullRunEndEncoded(int64_t i) const; + + bool UnionMayHaveLogicalNulls() const; + bool RunEndEncodedMayHaveLogicalNulls() const; + bool DictionaryMayHaveLogicalNulls() const; +}; + +namespace internal { + +void FillZeroLengthArray(const DataType* type, ArraySpan* span); + +/// Construct a zero-copy view of this ArrayData with the given type. +/// +/// This method checks if the types are layout-compatible. +/// Nested types are traversed in depth-first order. Data buffers must have +/// the same item sizes, even though the logical types may be different. +/// An error is returned if the types are not layout-compatible. +ARROW_EXPORT +Result> GetArrayView(const std::shared_ptr& data, + const std::shared_ptr& type); + +} // namespace internal +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/diff.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/diff.h new file mode 100644 index 0000000000000000000000000000000000000000..a405164b333f3b21a17e8414ef59a8a628c28579 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/diff.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
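Before moving on to diff.h, a short sketch of the ArrayData accessors declared above: Slice() is zero-copy and the single-argument GetValues() applies the slice offset automatically. The int32 element type, the assumption that the input has no nulls, and the helper name SumInt32Slice are illustrative only.

    #include <cstdint>
    #include <memory>
    #include "arrow/api.h"

    int64_t SumInt32Slice(const std::shared_ptr<arrow::Array>& array,
                          int64_t offset, int64_t length) {
      // Zero-copy: buffers are shared, and the slice's null count may be left
      // unknown until GetNullCount() computes it.
      std::shared_ptr<arrow::ArrayData> sliced = array->data()->Slice(offset, length);
      const int32_t* values = sliced->GetValues<int32_t>(1);  // buffer 1 = values
      int64_t sum = 0;
      for (int64_t i = 0; i < sliced->length; ++i) sum += values[i];
      return sum;
    }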
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_nested.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Compare two arrays, returning an edit script which expresses the difference +/// between them +/// +/// An edit script is an array of struct(insert: bool, run_length: int64_t). +/// Each element of "insert" determines whether an element was inserted into (true) +/// or deleted from (false) base. Each insertion or deletion is followed by a run of +/// elements which are unchanged from base to target; the length of this run is stored +/// in "run_length". (Note that the edit script begins and ends with a run of shared +/// elements but both fields of the struct must have the same length. To accommodate this +/// the first element of "insert" should be ignored.) +/// +/// For example for base "hlloo" and target "hello", the edit script would be +/// [ +/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h") +/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo") +/// {"insert": false, "run_length": 0} // delete("o") then an empty run +/// ] +/// +/// Diffing arrays containing nulls is not currently supported. +/// +/// \param[in] base baseline for comparison +/// \param[in] target an array of identical type to base whose elements differ from base's +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \return an edit script array which can be applied to base to produce target +ARROW_EXPORT +Result> Diff(const Array& base, const Array& target, + MemoryPool* pool = default_memory_pool()); + +/// \brief visitor interface for easy traversal of an edit script +/// +/// visitor will be called for each hunk of insertions and deletions. +ARROW_EXPORT Status VisitEditScript( + const Array& edits, + const std::function& visitor); + +/// \brief return a function which will format an edit script in unified +/// diff format to os, given base and target arrays of type +ARROW_EXPORT Result< + std::function> +MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os); + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/statistics.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/statistics.h new file mode 100644 index 0000000000000000000000000000000000000000..523f877bbe429c39bd9b6265a58c5c313abaeb42 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/statistics.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
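A sketch of wiring Diff() to MakeUnifiedDiffFormatter(), in the spirit of the "hlloo" vs "hello" example in the comment above; the helper name PrintDiff is an editorial placeholder, and both inputs are assumed to have the same type and no nulls.

    #include <iostream>
    #include <memory>
    #include "arrow/api.h"
    #include "arrow/array/diff.h"

    arrow::Status PrintDiff(const arrow::Array& base, const arrow::Array& target) {
      // The edit script is a struct(insert: bool, run_length: int64) array as
      // documented above.
      ARROW_ASSIGN_OR_RAISE(auto edits, arrow::Diff(base, target));
      ARROW_ASSIGN_OR_RAISE(auto format,
                            arrow::MakeUnifiedDiffFormatter(*base.type(), &std::cout));
      return format(*edits, base, target);
    }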
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Statistics for an Array +/// +/// Apache Arrow format doesn't have statistics but data source such +/// as Apache Parquet may have statistics. Statistics associated with +/// data source can be read unified API via this class. +struct ARROW_EXPORT ArrayStatistics { + using ValueType = std::variant; + + /// \brief The number of null values, may not be set + std::optional null_count = std::nullopt; + + /// \brief The number of distinct values, may not be set + std::optional distinct_count = std::nullopt; + + /// \brief The minimum value, may not be set + std::optional min = std::nullopt; + + /// \brief Whether the minimum value is exact or not + bool is_min_exact = false; + + /// \brief The maximum value, may not be set + std::optional max = std::nullopt; + + /// \brief Whether the maximum value is exact or not + bool is_max_exact = false; + + /// \brief Check two statistics for equality + bool Equals(const ArrayStatistics& other) const { + return null_count == other.null_count && distinct_count == other.distinct_count && + min == other.min && is_min_exact == other.is_min_exact && max == other.max && + is_max_exact == other.is_max_exact; + } + + /// \brief Check two statistics for equality + bool operator==(const ArrayStatistics& other) const { return Equals(other); } + + /// \brief Check two statistics for not equality + bool operator!=(const ArrayStatistics& other) const { return !Equals(other); } +}; + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/util.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/util.h new file mode 100644 index 0000000000000000000000000000000000000000..fd8e75ddb86405c523a8083f559dab0e72364e24 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/util.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
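A small sketch of populating the ArrayStatistics struct declared above, e.g. when carrying over column statistics from a data source such as Parquet; the concrete numbers are placeholders, and int64_t is assumed to be one of the ValueType variant alternatives.

    #include <cstdint>
    #include "arrow/array/statistics.h"

    arrow::ArrayStatistics MakeStats() {
      arrow::ArrayStatistics stats;
      stats.null_count = 0;
      stats.distinct_count = 42;
      stats.min = static_cast<int64_t>(1);
      stats.is_min_exact = true;
      stats.max = static_cast<int64_t>(99);
      stats.is_max_exact = true;
      return stats;
    }

Two such structs compare equal via operator== only when every field, including the exactness flags, matches.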
+ +#pragma once + +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/compare.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \defgroup array-factories Array factory functions +/// +/// @{ + +/// \brief Create a strongly-typed Array instance from generic ArrayData +/// \param[in] data the array contents +/// \return the resulting Array instance +ARROW_EXPORT +std::shared_ptr MakeArray(const std::shared_ptr& data); + +/// \brief Create a strongly-typed Array instance with all elements null +/// \param[in] type the array type +/// \param[in] length the array length +/// \param[in] pool the memory pool to allocate memory from +ARROW_EXPORT +Result> MakeArrayOfNull(const std::shared_ptr& type, + int64_t length, + MemoryPool* pool = default_memory_pool()); + +/// \brief Create an Array instance whose slots are the given scalar +/// \param[in] scalar the value with which to fill the array +/// \param[in] length the array length +/// \param[in] pool the memory pool to allocate memory from +ARROW_EXPORT +Result> MakeArrayFromScalar( + const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool()); + +/// \brief Create an empty Array of a given type +/// +/// The output Array will be of the given type. +/// +/// \param[in] type the data type of the empty Array +/// \param[in] pool the memory pool to allocate memory from +/// \return the resulting Array +ARROW_EXPORT +Result> MakeEmptyArray(std::shared_ptr type, + MemoryPool* pool = default_memory_pool()); + +/// @} + +namespace internal { + +/// \brief Swap endian of each element in a generic ArrayData +/// +/// As dictionaries are often shared between different arrays, dictionaries +/// are not swapped by this function and should be handled separately. +/// +/// \param[in] data the array contents +/// \param[in] pool the memory pool to allocate memory from +/// \return the resulting ArrayData whose elements were swapped +ARROW_EXPORT +Result> SwapEndianArrayData( + const std::shared_ptr& data, MemoryPool* pool = default_memory_pool()); + +/// Given a number of ArrayVectors, treat each ArrayVector as the +/// chunks of a chunked array. Then rechunk each ArrayVector such that +/// all ArrayVectors are chunked identically. It is mandatory that +/// all ArrayVectors contain the same total number of elements. +ARROW_EXPORT +std::vector RechunkArraysConsistently(const std::vector&); + +} // namespace internal +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/validate.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/validate.h new file mode 100644 index 0000000000000000000000000000000000000000..3ebfa0a51edce21ca585862b1dbb074b6cf8d9c8 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/array/validate.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
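A sketch of the three public factory helpers declared above; the chosen types, lengths, the scalar value 7, and the helper name FactoryExamples are illustrative assumptions.

    #include <memory>
    #include "arrow/api.h"

    arrow::Status FactoryExamples() {
      // An all-null int32 array of length 5.
      ARROW_ASSIGN_OR_RAISE(auto nulls, arrow::MakeArrayOfNull(arrow::int32(), 5));

      // An array of length 3 whose every slot is the int32 scalar 7.
      arrow::Int32Scalar seven(7);
      ARROW_ASSIGN_OR_RAISE(auto sevens, arrow::MakeArrayFromScalar(seven, 3));

      // A zero-length array of the given type.
      ARROW_ASSIGN_OR_RAISE(auto empty, arrow::MakeEmptyArray(arrow::utf8()));

      (void)nulls;
      (void)sevens;
      (void)empty;
      return arrow::Status::OK();
    }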
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +// Internal functions implementing Array::Validate() and friends. + +// O(1) array metadata validation + +ARROW_EXPORT +Status ValidateArray(const Array& array); + +ARROW_EXPORT +Status ValidateArray(const ArrayData& data); + +// O(N) array data validation. +// Note that, starting from 7.0.0, "full" routines also validate metadata. +// Before, ValidateArray() needed to be called before ValidateArrayFull() +// to ensure metadata correctness, otherwise invalid memory accesses +// may occur. + +ARROW_EXPORT +Status ValidateArrayFull(const Array& array); + +ARROW_EXPORT +Status ValidateArrayFull(const ArrayData& data); + +ARROW_EXPORT +Status ValidateUTF8(const Array& array); + +ARROW_EXPORT +Status ValidateUTF8(const ArrayData& data); + +} // namespace internal +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/buffer.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..fbf4a22e350cac7f6cffa766d96fe149ddb996db --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/buffer.h @@ -0,0 +1,587 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/device.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/span.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Buffer classes + +/// \class Buffer +/// \brief Object containing a pointer to a piece of contiguous memory with a +/// particular size. +/// +/// Buffers have two related notions of length: size and capacity. Size is +/// the number of bytes that might have valid data. Capacity is the number +/// of bytes that were allocated for the buffer in total. +/// +/// The Buffer base class does not own its memory, but subclasses often do. 
+/// +/// The following invariant is always true: Size <= Capacity +class ARROW_EXPORT Buffer { + public: + ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); + + /// \brief Construct from buffer and size without copying memory + /// + /// \param[in] data a memory buffer + /// \param[in] size buffer size + /// + /// \note The passed memory must be kept alive through some other means + Buffer(const uint8_t* data, int64_t size) + : is_mutable_(false), + is_cpu_(true), + data_(data), + size_(size), + capacity_(size), + device_type_(DeviceAllocationType::kCPU) { + SetMemoryManager(default_cpu_memory_manager()); + } + + Buffer(const uint8_t* data, int64_t size, std::shared_ptr mm, + std::shared_ptr parent = NULLPTR, + std::optional device_type_override = std::nullopt) + : is_mutable_(false), + data_(data), + size_(size), + capacity_(size), + parent_(std::move(parent)) { + // SetMemoryManager will also set device_type_ + SetMemoryManager(std::move(mm)); + // If a device type is specified, use that instead. Example of when this can be + // useful: the CudaMemoryManager can set device_type_ to kCUDA, but you can specify + // device_type_override=kCUDA_HOST as the device type to override it. + if (device_type_override != std::nullopt) { + device_type_ = *device_type_override; + } + } + + Buffer(uintptr_t address, int64_t size, std::shared_ptr mm, + std::shared_ptr parent = NULLPTR) + : Buffer(reinterpret_cast(address), size, std::move(mm), + std::move(parent)) {} + + /// \brief Construct from string_view without copying memory + /// + /// \param[in] data a string_view object + /// + /// \note The memory viewed by data must not be deallocated in the lifetime of the + /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere + explicit Buffer(std::string_view data) + : Buffer(reinterpret_cast(data.data()), + static_cast(data.size())) {} + + virtual ~Buffer() = default; + + /// An offset into data that is owned by another buffer, but we want to be + /// able to retain a valid pointer to it even after other shared_ptr's to the + /// parent buffer have been destroyed + /// + /// This method makes no assertions about alignment or padding of the buffer but + /// in general we expected buffers to be aligned and padded to 64 bytes. In the future + /// we might add utility methods to help determine if a buffer satisfies this contract. + Buffer(const std::shared_ptr& parent, const int64_t offset, const int64_t size) + : Buffer(parent->data_ + offset, size) { + parent_ = parent; + SetMemoryManager(parent->memory_manager_); + } + + uint8_t operator[](std::size_t i) const { return data_[i]; } + + /// \brief Construct a new std::string with a hexadecimal representation of the buffer. + /// \return std::string + std::string ToHexString(); + + /// Return true if both buffers are the same size and contain the same bytes + /// up to the number of compared bytes + bool Equals(const Buffer& other, int64_t nbytes) const; + + /// Return true if both buffers are the same size and contain the same bytes + bool Equals(const Buffer& other) const; + + /// Copy a section of the buffer into a new Buffer. + Result> CopySlice( + const int64_t start, const int64_t nbytes, + MemoryPool* pool = default_memory_pool()) const; + + /// Zero bytes in padding, i.e. bytes between size_ and capacity_. 
+ void ZeroPadding() { +#ifndef NDEBUG + CheckMutable(); +#endif + // A zero-capacity buffer can have a null data pointer + if (capacity_ != 0) { + memset(mutable_data() + size_, 0, static_cast(capacity_ - size_)); + } + } + + /// \brief Construct an immutable buffer that takes ownership of the contents + /// of an std::string (without copying it). + /// + /// \param[in] data a string to own + /// \return a new Buffer instance + static std::shared_ptr FromString(std::string data); + + /// \brief Construct an immutable buffer that takes ownership of the contents + /// of an std::vector (without copying it). Only vectors of TrivialType objects + /// (integers, floating point numbers, ...) can be wrapped by this function. + /// + /// \param[in] vec a vector to own + /// \return a new Buffer instance + template + static std::shared_ptr FromVector(std::vector vec) { + static_assert(std::is_trivial_v, + "Buffer::FromVector can only wrap vectors of trivial objects"); + + if (vec.empty()) { + return std::shared_ptr{new Buffer()}; + } + + auto* data = reinterpret_cast(vec.data()); + auto size_in_bytes = static_cast(vec.size() * sizeof(T)); + return std::shared_ptr{ + new Buffer{data, size_in_bytes}, + // Keep the vector's buffer alive inside the shared_ptr's destructor until after + // we have deleted the Buffer. Note we can't use this trick in FromString since + // std::string's data is inline for short strings so moving invalidates pointers + // into the string's buffer. + [vec = std::move(vec)](Buffer* buffer) { delete buffer; }}; + } + + /// \brief Create buffer referencing typed memory with some length without + /// copying + /// \param[in] data the typed memory as C array + /// \param[in] length the number of values in the array + /// \return a new shared_ptr + template + static std::shared_ptr Wrap(const T* data, SizeType length) { + return std::make_shared(reinterpret_cast(data), + static_cast(sizeof(T) * length)); + } + + /// \brief Create buffer referencing std::vector with some length without + /// copying + /// \param[in] data the vector to be referenced. If this vector is changed, + /// the buffer may become invalid + /// \return a new shared_ptr + template + static std::shared_ptr Wrap(const std::vector& data) { + return std::make_shared(reinterpret_cast(data.data()), + static_cast(sizeof(T) * data.size())); + } + + /// \brief Copy buffer contents into a new std::string + /// \return std::string + /// \note Can throw std::bad_alloc if buffer is large + std::string ToString() const; + + /// \brief View buffer contents as a std::string_view + /// \return std::string_view + explicit operator std::string_view() const { + return {reinterpret_cast(data_), static_cast(size_)}; + } + + /// \brief Return a pointer to the buffer's data + /// + /// The buffer has to be a CPU buffer (`is_cpu()` is true). + /// Otherwise, an assertion may be thrown or a null pointer may be returned. + /// + /// To get the buffer's data address regardless of its device, call `address()`. + const uint8_t* data() const { +#ifndef NDEBUG + CheckCPU(); +#endif + return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR; + } + + /// \brief Return a pointer to the buffer's data cast to a specific type + /// + /// The buffer has to be a CPU buffer (`is_cpu()` is true). + /// Otherwise, an assertion may be thrown or a null pointer may be returned. 
+ template + const T* data_as() const { + return reinterpret_cast(data()); + } + + /// \brief Return the buffer's data as a span + template + util::span span_as() const { + return util::span(data_as(), static_cast(size() / sizeof(T))); + } + + /// \brief Return a writable pointer to the buffer's data + /// + /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()` + /// are true). Otherwise, an assertion may be thrown or a null pointer may + /// be returned. + /// + /// To get the buffer's mutable data address regardless of its device, call + /// `mutable_address()`. + uint8_t* mutable_data() { +#ifndef NDEBUG + CheckCPU(); + CheckMutable(); +#endif + return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast(data_) + : NULLPTR; + } + + /// \brief Return a writable pointer to the buffer's data cast to a specific type + /// + /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()` + /// are true). Otherwise, an assertion may be thrown or a null pointer may + /// be returned. + template + T* mutable_data_as() { + return reinterpret_cast(mutable_data()); + } + + /// \brief Return the buffer's mutable data as a span + template + util::span mutable_span_as() { + return util::span(mutable_data_as(), static_cast(size() / sizeof(T))); + } + + /// \brief Return the device address of the buffer's data + uintptr_t address() const { return reinterpret_cast(data_); } + + /// \brief Return a writable device address to the buffer's data + /// + /// The buffer has to be a mutable buffer (`is_mutable()` is true). + /// Otherwise, an assertion may be thrown or 0 may be returned. + uintptr_t mutable_address() const { +#ifndef NDEBUG + CheckMutable(); +#endif + return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast(data_) : 0; + } + + /// \brief Return the buffer's size in bytes + int64_t size() const { return size_; } + + /// \brief Return the buffer's capacity (number of allocated bytes) + int64_t capacity() const { return capacity_; } + + /// \brief Whether the buffer is directly CPU-accessible + /// + /// If this function returns true, you can read directly from the buffer's + /// `data()` pointer. Otherwise, you'll have to `View()` or `Copy()` it. + bool is_cpu() const { return is_cpu_; } + + /// \brief Whether the buffer is mutable + /// + /// If this function returns true, you are allowed to modify buffer contents + /// using the pointer returned by `mutable_data()` or `mutable_address()`. + bool is_mutable() const { return is_mutable_; } + + const std::shared_ptr& device() const { return memory_manager_->device(); } + + const std::shared_ptr& memory_manager() const { return memory_manager_; } + + DeviceAllocationType device_type() const { return device_type_; } + + std::shared_ptr parent() const { return parent_; } + + /// \brief Get a RandomAccessFile for reading a buffer + /// + /// The returned file object reads from this buffer's underlying memory. + static Result> GetReader(std::shared_ptr); + + /// \brief Get a OutputStream for writing to a buffer + /// + /// The buffer must be mutable. The returned stream object writes into the buffer's + /// underlying memory (but it won't resize it). + static Result> GetWriter(std::shared_ptr); + + /// \brief Copy buffer + /// + /// The buffer contents will be copied into a new buffer allocated by the + /// given MemoryManager. This function supports cross-device copies. 
+ static Result> Copy(std::shared_ptr source, + const std::shared_ptr& to); + + /// \brief Copy a non-owned buffer + /// + /// This is useful for cases where the source memory area is externally managed + /// (its lifetime not tied to the source Buffer), otherwise please use Copy(). + static Result> CopyNonOwned( + const Buffer& source, const std::shared_ptr& to); + + /// \brief View buffer + /// + /// Return a Buffer that reflects this buffer, seen potentially from another + /// device, without making an explicit copy of the contents. The underlying + /// mechanism is typically implemented by the kernel or device driver, and may + /// involve lazy caching of parts of the buffer contents on the destination + /// device's memory. + /// + /// If a non-copy view is unsupported for the buffer on the given device, + /// nullptr is returned. An error can be returned if some low-level + /// operation fails (such as an out-of-memory condition). + static Result> View(std::shared_ptr source, + const std::shared_ptr& to); + + /// \brief View or copy buffer + /// + /// Try to view buffer contents on the given MemoryManager's device, but + /// fall back to copying if a no-copy view isn't supported. + static Result> ViewOrCopy( + std::shared_ptr source, const std::shared_ptr& to); + + virtual std::shared_ptr device_sync_event() const { return NULLPTR; } + + protected: + bool is_mutable_; + bool is_cpu_; + const uint8_t* data_; + int64_t size_; + int64_t capacity_; + DeviceAllocationType device_type_; + + // null by default, but may be set + std::shared_ptr parent_; + + private: + // private so that subclasses are forced to call SetMemoryManager() + std::shared_ptr memory_manager_; + + protected: + Buffer(); + + void CheckMutable() const; + void CheckCPU() const; + + void SetMemoryManager(std::shared_ptr mm) { + memory_manager_ = std::move(mm); + is_cpu_ = memory_manager_->is_cpu(); + device_type_ = memory_manager_->device()->device_type(); + } +}; + +/// \defgroup buffer-slicing-functions Functions for slicing buffers +/// +/// @{ + +/// \brief Construct a view on a buffer at the given offset and length. +/// +/// This function cannot fail and does not check for errors (except in debug builds) +static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, + const int64_t offset, + const int64_t length) { + return std::make_shared(buffer, offset, length); +} + +/// \brief Construct a view on a buffer at the given offset, up to the buffer's end. +/// +/// This function cannot fail and does not check for errors (except in debug builds) +static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, + const int64_t offset) { + int64_t length = buffer->size() - offset; + return SliceBuffer(buffer, offset, length); +} + +/// \brief Input-checking version of SliceBuffer +/// +/// An Invalid Status is returned if the requested slice falls out of bounds. +ARROW_EXPORT +Result> SliceBufferSafe(const std::shared_ptr& buffer, + int64_t offset); +/// \brief Input-checking version of SliceBuffer +/// +/// An Invalid Status is returned if the requested slice falls out of bounds. +/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size. +ARROW_EXPORT +Result> SliceBufferSafe(const std::shared_ptr& buffer, + int64_t offset, int64_t length); + +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). 
+ARROW_EXPORT +std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, + const int64_t offset, const int64_t length); + +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). +static inline std::shared_ptr SliceMutableBuffer( + const std::shared_ptr& buffer, const int64_t offset) { + int64_t length = buffer->size() - offset; + return SliceMutableBuffer(buffer, offset, length); +} + +/// \brief Input-checking version of SliceMutableBuffer +/// +/// An Invalid Status is returned if the requested slice falls out of bounds. +ARROW_EXPORT +Result> SliceMutableBufferSafe( + const std::shared_ptr& buffer, int64_t offset); +/// \brief Input-checking version of SliceMutableBuffer +/// +/// An Invalid Status is returned if the requested slice falls out of bounds. +/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size. +ARROW_EXPORT +Result> SliceMutableBufferSafe( + const std::shared_ptr& buffer, int64_t offset, int64_t length); + +/// @} + +/// \class MutableBuffer +/// \brief A Buffer whose contents can be mutated. May or may not own its data. +class ARROW_EXPORT MutableBuffer : public Buffer { + public: + MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) { + is_mutable_ = true; + } + + MutableBuffer(uint8_t* data, const int64_t size, std::shared_ptr mm) + : Buffer(data, size, std::move(mm)) { + is_mutable_ = true; + } + + MutableBuffer(const std::shared_ptr& parent, const int64_t offset, + const int64_t size); + + /// \brief Create buffer referencing typed memory with some length + /// \param[in] data the typed memory as C array + /// \param[in] length the number of values in the array + /// \return a new shared_ptr + template + static std::shared_ptr Wrap(T* data, SizeType length) { + return std::make_shared(reinterpret_cast(data), + static_cast(sizeof(T) * length)); + } + + protected: + MutableBuffer() : Buffer(NULLPTR, 0) {} +}; + +/// \class ResizableBuffer +/// \brief A mutable buffer that can be resized +class ARROW_EXPORT ResizableBuffer : public MutableBuffer { + public: + /// Change buffer reported size to indicated size, allocating memory if + /// necessary. This will ensure that the capacity of the buffer is a multiple + /// of 64 bytes as defined in Layout.md. + /// Consider using ZeroPadding afterwards, to conform to the Arrow layout + /// specification. + /// + /// @param new_size The new size for the buffer. + /// @param shrink_to_fit Whether to shrink the capacity if new size < current size + virtual Status Resize(const int64_t new_size, bool shrink_to_fit) = 0; + Status Resize(const int64_t new_size) { + return Resize(new_size, /*shrink_to_fit=*/true); + } + + /// Ensure that buffer has enough memory allocated to fit the indicated + /// capacity (and meets the 64 byte padding requirement in Layout.md). + /// It does not change buffer's reported size and doesn't zero the padding. 
+ virtual Status Reserve(const int64_t new_capacity) = 0; + + template + Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) { + return Resize(sizeof(T) * new_nb_elements, shrink_to_fit); + } + + template + Status TypedReserve(const int64_t new_nb_elements) { + return Reserve(sizeof(T) * new_nb_elements); + } + + protected: + ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} + ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr mm) + : MutableBuffer(data, size, std::move(mm)) {} +}; + +/// \defgroup buffer-allocation-functions Functions for allocating buffers +/// +/// @{ + +/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. +/// +/// \param[in] size size of buffer to allocate +/// \param[in] pool a memory pool +ARROW_EXPORT +Result> AllocateBuffer(const int64_t size, + MemoryPool* pool = NULLPTR); +ARROW_EXPORT +Result> AllocateBuffer(const int64_t size, int64_t alignment, + MemoryPool* pool = NULLPTR); + +/// \brief Allocate a resizeable buffer from a memory pool, zero its padding. +/// +/// \param[in] size size of buffer to allocate +/// \param[in] pool a memory pool +ARROW_EXPORT +Result> AllocateResizableBuffer( + const int64_t size, MemoryPool* pool = NULLPTR); +ARROW_EXPORT +Result> AllocateResizableBuffer( + const int64_t size, const int64_t alignment, MemoryPool* pool = NULLPTR); + +/// \brief Allocate a bitmap buffer from a memory pool +/// no guarantee on values is provided. +/// +/// \param[in] length size in bits of bitmap to allocate +/// \param[in] pool memory pool to allocate memory from +ARROW_EXPORT +Result> AllocateBitmap(int64_t length, + MemoryPool* pool = NULLPTR); + +/// \brief Allocate a zero-initialized bitmap buffer from a memory pool +/// +/// \param[in] length size in bits of bitmap to allocate +/// \param[in] pool memory pool to allocate memory from +ARROW_EXPORT +Result> AllocateEmptyBitmap(int64_t length, + MemoryPool* pool = NULLPTR); + +ARROW_EXPORT +Result> AllocateEmptyBitmap(int64_t length, int64_t alignment, + MemoryPool* pool = NULLPTR); + +/// \brief Concatenate multiple buffers into a single buffer +/// +/// \param[in] buffers to be concatenated +/// \param[in] pool memory pool to allocate the new buffer from +ARROW_EXPORT +Result> ConcatenateBuffers(const BufferVector& buffers, + MemoryPool* pool = NULLPTR); + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/buffer_builder.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/buffer_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..a84c98b6b24917faf53a821c5c3e5f62471bb9aa --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/buffer_builder.h @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
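A sketch combining the allocation and slicing helpers from buffer.h above: allocate a mutable buffer, fill it, then take a zero-copy slice that keeps the parent alive. The sizes and the helper name BufferExample are placeholders.

    #include <cstring>
    #include <memory>
    #include "arrow/api.h"

    arrow::Status BufferExample() {
      // AllocateBuffer hands back a mutable buffer from the pool with its
      // padding zeroed.
      ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::Buffer> owned,
                            arrow::AllocateBuffer(128));
      std::memset(owned->mutable_data(), 0, static_cast<size_t>(owned->size()));

      // SliceBuffer creates a view; the shared_ptr parent keeps the memory alive.
      std::shared_ptr<arrow::Buffer> shared = std::move(owned);
      std::shared_ptr<arrow::Buffer> head = arrow::SliceBuffer(shared, 0, 16);
      (void)head;
      return arrow::Status::OK();
    }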
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_generate.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/macros.h" +#include "arrow/util/ubsan.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Buffer builder classes + +/// \class BufferBuilder +/// \brief A class for incrementally building a contiguous chunk of in-memory +/// data +class ARROW_EXPORT BufferBuilder { + public: + explicit BufferBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : pool_(pool), + data_(/*ensure never null to make ubsan happy and avoid check penalties below*/ + util::MakeNonNull()), + capacity_(0), + size_(0), + alignment_(alignment) {} + + /// \brief Constructs new Builder that will start using + /// the provided buffer until Finish/Reset are called. + /// The buffer is not resized. + explicit BufferBuilder(std::shared_ptr buffer, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : buffer_(std::move(buffer)), + pool_(pool), + data_(buffer_->mutable_data()), + capacity_(buffer_->capacity()), + size_(buffer_->size()), + alignment_(alignment) {} + + /// \brief Resize the buffer to the nearest multiple of 64 bytes + /// + /// \param new_capacity the new capacity of the of the builder. Will be + /// rounded up to a multiple of 64 bytes for padding + /// \param shrink_to_fit if new capacity is smaller than the existing, + /// reallocate internal buffer. Set to false to avoid reallocations when + /// shrinking the builder. + /// \return Status + Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { + if (buffer_ == NULLPTR) { + ARROW_ASSIGN_OR_RAISE(buffer_, + AllocateResizableBuffer(new_capacity, alignment_, pool_)); + } else { + ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit)); + } + capacity_ = buffer_->capacity(); + data_ = buffer_->mutable_data(); + return Status::OK(); + } + + /// \brief Ensure that builder can accommodate the additional number of bytes + /// without the need to perform allocations + /// + /// \param[in] additional_bytes number of additional bytes to make space for + /// \return Status + Status Reserve(const int64_t additional_bytes) { + auto min_capacity = size_ + additional_bytes; + if (min_capacity <= capacity_) { + return Status::OK(); + } + return Resize(GrowByFactor(capacity_, min_capacity), false); + } + + /// \brief Return a capacity expanded by the desired growth factor + static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) { + // Doubling capacity except for large Reserve requests. 2x growth strategy + // (versus 1.5x) seems to have slightly better performance when using + // jemalloc, but significantly better performance when using the system + // allocator. See ARROW-6450 for further discussion + return std::max(new_capacity, current_capacity * 2); + } + + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. 
+ Status Append(const void* data, const int64_t length) { + if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) { + ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false)); + } + UnsafeAppend(data, length); + return Status::OK(); + } + + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. + Status Append(std::string_view v) { return Append(v.data(), v.size()); } + + /// \brief Append copies of a value to the buffer + /// + /// The buffer is automatically expanded if necessary. + Status Append(const int64_t num_copies, uint8_t value) { + ARROW_RETURN_NOT_OK(Reserve(num_copies)); + UnsafeAppend(num_copies, value); + return Status::OK(); + } + + // Advance pointer and zero out memory + Status Advance(const int64_t length) { return Append(length, 0); } + + // Advance pointer, but don't allocate or zero memory + void UnsafeAdvance(const int64_t length) { size_ += length; } + + // Unsafe methods don't check existing size + void UnsafeAppend(const void* data, const int64_t length) { + memcpy(data_ + size_, data, static_cast(length)); + size_ += length; + } + + void UnsafeAppend(std::string_view v) { + UnsafeAppend(v.data(), static_cast(v.size())); + } + + void UnsafeAppend(const int64_t num_copies, uint8_t value) { + memset(data_ + size_, value, static_cast(num_copies)); + size_ += num_copies; + } + + /// \brief Return result of builder as a Buffer object. + /// + /// The builder is reset and can be reused afterwards. + /// + /// \param[out] out the finalized Buffer object + /// \param shrink_to_fit if the buffer size is smaller than its capacity, + /// reallocate to fit more tightly in memory. Set to false to avoid + /// a reallocation, at the expense of potentially more memory consumption. + /// \return Status + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); + if (size_ != 0) buffer_->ZeroPadding(); + *out = buffer_; + if (*out == NULLPTR) { + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, alignment_, pool_)); + } + Reset(); + return Status::OK(); + } + + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + + /// \brief Like Finish, but override the final buffer size + /// + /// This is useful after writing data directly into the builder memory + /// without calling the Append methods (basically, when using BufferBuilder + /// mostly for memory allocation). + Result> FinishWithLength(int64_t final_length, + bool shrink_to_fit = true) { + size_ = final_length; + return Finish(shrink_to_fit); + } + + void Reset() { + buffer_ = NULLPTR; + capacity_ = size_ = 0; + } + + /// \brief Set size to a smaller value without modifying builder + /// contents. 
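A minimal usage sketch of the BufferBuilder shown above (not part of the vendored header; the function name BuildGreeting is illustrative): reserve capacity, append raw bytes, and hand the result off as an arrow::Buffer.

#include <memory>
#include <string_view>

#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"

arrow::Status BuildGreeting(std::shared_ptr<arrow::Buffer>* out) {
  arrow::BufferBuilder builder;                      // uses default_memory_pool()
  ARROW_RETURN_NOT_OK(builder.Reserve(16));          // pre-allocate to avoid regrowth
  builder.UnsafeAppend(std::string_view("hello "));  // safe only because capacity was reserved
  ARROW_RETURN_NOT_OK(builder.Append("world", 5));   // checked variant grows automatically
  return builder.Finish(out);                        // transfers ownership to *out and resets the builder
}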
For reusable BufferBuilder classes + /// \param[in] position must be non-negative and less than or equal + /// to the current length() + void Rewind(int64_t position) { size_ = position; } + + int64_t capacity() const { return capacity_; } + int64_t length() const { return size_; } + const uint8_t* data() const { return data_; } + uint8_t* mutable_data() { return data_; } + template + const T* data_as() const { + return reinterpret_cast(data_); + } + template + T* mutable_data_as() { + return reinterpret_cast(data_); + } + + private: + std::shared_ptr buffer_; + MemoryPool* pool_; + uint8_t* data_; + int64_t capacity_; + int64_t size_; + int64_t alignment_; +}; + +template +class TypedBufferBuilder; + +/// \brief A BufferBuilder for building a buffer of arithmetic elements +template +class TypedBufferBuilder< + T, typename std::enable_if::value || + std::is_standard_layout::value>::type> { + public: + explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : bytes_builder_(pool, alignment) {} + + explicit TypedBufferBuilder(std::shared_ptr buffer, + MemoryPool* pool = default_memory_pool()) + : bytes_builder_(std::move(buffer), pool) {} + + explicit TypedBufferBuilder(BufferBuilder builder) + : bytes_builder_(std::move(builder)) {} + + BufferBuilder* bytes_builder() { return &bytes_builder_; } + + Status Append(T value) { + return bytes_builder_.Append(reinterpret_cast(&value), sizeof(T)); + } + + Status Append(const T* values, int64_t num_elements) { + return bytes_builder_.Append(reinterpret_cast(values), + num_elements * sizeof(T)); + } + + Status Append(const int64_t num_copies, T value) { + ARROW_RETURN_NOT_OK(Reserve(num_copies + length())); + UnsafeAppend(num_copies, value); + return Status::OK(); + } + + void UnsafeAppend(T value) { + bytes_builder_.UnsafeAppend(reinterpret_cast(&value), sizeof(T)); + } + + void UnsafeAppend(const T* values, int64_t num_elements) { + bytes_builder_.UnsafeAppend(reinterpret_cast(values), + num_elements * sizeof(T)); + } + + template + void UnsafeAppend(Iter values_begin, Iter values_end) { + auto num_elements = static_cast(std::distance(values_begin, values_end)); + auto data = mutable_data() + length(); + bytes_builder_.UnsafeAdvance(num_elements * sizeof(T)); + std::copy(values_begin, values_end, data); + } + + void UnsafeAppend(const int64_t num_copies, T value) { + auto data = mutable_data() + length(); + bytes_builder_.UnsafeAdvance(num_copies * sizeof(T)); + std::fill(data, data + num_copies, value); + } + + Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { + return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit); + } + + Status Reserve(const int64_t additional_elements) { + return bytes_builder_.Reserve(additional_elements * sizeof(T)); + } + + Status Advance(const int64_t length) { + return bytes_builder_.Advance(length * sizeof(T)); + } + + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + return bytes_builder_.Finish(out, shrink_to_fit); + } + + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + + /// \brief Like Finish, but override the final buffer size + /// + /// This is useful after writing data directly into the builder memory + /// without calling the Append methods (basically, when using TypedBufferBuilder + /// only for memory allocation). 
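A short sketch for the arithmetic TypedBufferBuilder above (illustrative helper, not part of the header); note that lengths are given in elements of T, not bytes.

#include <cstdint>
#include <memory>

#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"

arrow::Status BuildInt32Buffer(std::shared_ptr<arrow::Buffer>* out) {
  arrow::TypedBufferBuilder<int32_t> builder;
  const int32_t values[] = {1, 2, 3};
  ARROW_RETURN_NOT_OK(builder.Append(values, /*num_elements=*/3));
  ARROW_RETURN_NOT_OK(builder.Append(/*num_copies=*/5, int32_t{0}));  // five trailing zeros
  // builder.length() is now 8 elements; the finished buffer holds 8 * sizeof(int32_t) bytes.
  return builder.Finish(out);
}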
+ Result> FinishWithLength(int64_t final_length, + bool shrink_to_fit = true) { + return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit); + } + + void Reset() { bytes_builder_.Reset(); } + + int64_t length() const { return bytes_builder_.length() / sizeof(T); } + int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); } + const T* data() const { return reinterpret_cast(bytes_builder_.data()); } + T* mutable_data() { return reinterpret_cast(bytes_builder_.mutable_data()); } + + private: + BufferBuilder bytes_builder_; +}; + +/// \brief A BufferBuilder for building a buffer containing a bitmap +template <> +class TypedBufferBuilder { + public: + explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : bytes_builder_(pool, alignment) {} + + explicit TypedBufferBuilder(BufferBuilder builder) + : bytes_builder_(std::move(builder)) {} + + BufferBuilder* bytes_builder() { return &bytes_builder_; } + + Status Append(bool value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(value); + return Status::OK(); + } + + Status Append(const uint8_t* valid_bytes, int64_t num_elements) { + ARROW_RETURN_NOT_OK(Reserve(num_elements)); + UnsafeAppend(valid_bytes, num_elements); + return Status::OK(); + } + + Status Append(const int64_t num_copies, bool value) { + ARROW_RETURN_NOT_OK(Reserve(num_copies)); + UnsafeAppend(num_copies, value); + return Status::OK(); + } + + void UnsafeAppend(bool value) { + bit_util::SetBitTo(mutable_data(), bit_length_, value); + if (!value) { + ++false_count_; + } + ++bit_length_; + } + + /// \brief Append bits from an array of bytes (one value per byte) + void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) { + if (num_elements == 0) return; + int64_t i = 0; + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { + bool value = bytes[i++]; + false_count_ += !value; + return value; + }); + bit_length_ += num_elements; + } + + /// \brief Append bits from a packed bitmap + void UnsafeAppend(const uint8_t* bitmap, int64_t offset, int64_t num_elements) { + if (num_elements == 0) return; + internal::CopyBitmap(bitmap, offset, num_elements, mutable_data(), bit_length_); + false_count_ += num_elements - internal::CountSetBits(bitmap, offset, num_elements); + bit_length_ += num_elements; + } + + void UnsafeAppend(const int64_t num_copies, bool value) { + bit_util::SetBitsTo(mutable_data(), bit_length_, num_copies, value); + false_count_ += num_copies * !value; + bit_length_ += num_copies; + } + + template + void UnsafeAppend(const int64_t num_elements, Generator&& gen) { + if (num_elements == 0) return; + + if (count_falses) { + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { + bool value = gen(); + false_count_ += !value; + return value; + }); + } else { + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, + std::forward(gen)); + } + bit_length_ += num_elements; + } + + Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { + const int64_t old_byte_capacity = bytes_builder_.capacity(); + ARROW_RETURN_NOT_OK( + bytes_builder_.Resize(bit_util::BytesForBits(new_capacity), shrink_to_fit)); + // Resize() may have chosen a larger capacity (e.g. for padding), + // so ask it again before calling memset(). 
+ const int64_t new_byte_capacity = bytes_builder_.capacity(); + if (new_byte_capacity > old_byte_capacity) { + // The additional buffer space is 0-initialized for convenience, + // so that other methods can simply bump the length. + memset(mutable_data() + old_byte_capacity, 0, + static_cast(new_byte_capacity - old_byte_capacity)); + } + return Status::OK(); + } + + Status Reserve(const int64_t additional_elements) { + return Resize( + BufferBuilder::GrowByFactor(bit_length_, bit_length_ + additional_elements), + false); + } + + Status Advance(const int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(length)); + bit_length_ += length; + false_count_ += length; + return Status::OK(); + } + + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + // set bytes_builder_.size_ == byte size of data + bytes_builder_.UnsafeAdvance(bit_util::BytesForBits(bit_length_) - + bytes_builder_.length()); + bit_length_ = false_count_ = 0; + return bytes_builder_.Finish(out, shrink_to_fit); + } + + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + + /// \brief Like Finish, but override the final buffer size + /// + /// This is useful after writing data directly into the builder memory + /// without calling the Append methods (basically, when using TypedBufferBuilder + /// only for memory allocation). + Result> FinishWithLength(int64_t final_length, + bool shrink_to_fit = true) { + const auto final_byte_length = bit_util::BytesForBits(final_length); + bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length()); + bit_length_ = false_count_ = 0; + return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit); + } + + void Reset() { + bytes_builder_.Reset(); + bit_length_ = false_count_ = 0; + } + + int64_t length() const { return bit_length_; } + int64_t capacity() const { return bytes_builder_.capacity() * 8; } + const uint8_t* data() const { return bytes_builder_.data(); } + uint8_t* mutable_data() { return bytes_builder_.mutable_data(); } + int64_t false_count() const { return false_count_; } + + private: + BufferBuilder bytes_builder_; + int64_t bit_length_ = 0; + int64_t false_count_ = 0; +}; + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/builder.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/builder.h new file mode 100644 index 0000000000000000000000000000000000000000..f0aa14c1e0612d1872a5959998651a12668f449f --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/builder.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
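The bool specialization above packs appended values into a bitmap and tracks false_count(), which is how validity bitmaps are typically assembled. A minimal sketch (illustrative helper name, not part of the headers):

#include <memory>

#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"

arrow::Status BuildValidityBitmap(std::shared_ptr<arrow::Buffer>* out) {
  arrow::TypedBufferBuilder<bool> bits;
  ARROW_RETURN_NOT_OK(bits.Append(true));
  ARROW_RETURN_NOT_OK(bits.Append(false));
  ARROW_RETURN_NOT_OK(bits.Append(/*num_copies=*/6, true));
  // Eight bits were appended and packed into a single byte:
  // bits.length() == 8, bits.false_count() == 1.
  return bits.Finish(out);
}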
+ +#pragma once + +#include + +#include "arrow/array/builder_adaptive.h" // IWYU pragma: keep +#include "arrow/array/builder_base.h" // IWYU pragma: keep +#include "arrow/array/builder_binary.h" // IWYU pragma: keep +#include "arrow/array/builder_decimal.h" // IWYU pragma: keep +#include "arrow/array/builder_dict.h" // IWYU pragma: keep +#include "arrow/array/builder_nested.h" // IWYU pragma: keep +#include "arrow/array/builder_primitive.h" // IWYU pragma: keep +#include "arrow/array/builder_run_end.h" // IWYU pragma: keep +#include "arrow/array/builder_time.h" // IWYU pragma: keep +#include "arrow/array/builder_union.h" // IWYU pragma: keep +#include "arrow/status.h" +#include "arrow/util/visibility.h" diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/abi.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/abi.h new file mode 100644 index 0000000000000000000000000000000000000000..db051fff5ff058fb432cfe014eda1cd9392c0f1f --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/abi.h @@ -0,0 +1,233 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// \file abi.h Arrow C Data Interface +/// +/// The Arrow C Data interface defines a very small, stable set +/// of C definitions which can be easily copied into any project's +/// source code and vendored to be used for columnar data interchange +/// in the Arrow format. For non-C/C++ languages and runtimes, +/// it should be almost as easy to translate the C definitions into +/// the corresponding C FFI declarations. +/// +/// Applications and libraries can therefore work with Arrow memory +/// without necessarily using the Arrow libraries or reinventing +/// the wheel. Developers can choose between tight integration +/// with the Arrow software project or minimal integration with +/// the Arrow format only. 
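The builder.h file above is an umbrella header that pulls in the concrete array builders. As a hedged illustration (Int64Builder comes from the included builder_primitive.h, not from the text shown here), building a small array looks roughly like this:

#include <memory>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/status.h"

arrow::Status BuildInt64Array(std::shared_ptr<arrow::Array>* out) {
  arrow::Int64Builder builder;
  ARROW_RETURN_NOT_OK(builder.Append(1));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_RETURN_NOT_OK(builder.Append(3));
  return builder.Finish(out);  // yields an Int64Array of length 3 with one null
}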
+ +#pragma once + +#include + +// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef ARROW_C_DATA_INTERFACE +# define ARROW_C_DATA_INTERFACE + +# define ARROW_FLAG_DICTIONARY_ORDERED 1 +# define ARROW_FLAG_NULLABLE 2 +# define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_DEVICE_DATA_INTERFACE +# define ARROW_C_DEVICE_DATA_INTERFACE + +// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html + +// DeviceType for the allocated memory +typedef int32_t ArrowDeviceType; + +// CPU device, same as using ArrowArray directly +# define ARROW_DEVICE_CPU 1 +// CUDA GPU Device +# define ARROW_DEVICE_CUDA 2 +// Pinned CUDA CPU memory by cudaMallocHost +# define ARROW_DEVICE_CUDA_HOST 3 +// OpenCL Device +# define ARROW_DEVICE_OPENCL 4 +// Vulkan buffer for next-gen graphics +# define ARROW_DEVICE_VULKAN 7 +// Metal for Apple GPU +# define ARROW_DEVICE_METAL 8 +// Verilog simulator buffer +# define ARROW_DEVICE_VPI 9 +// ROCm GPUs for AMD GPUs +# define ARROW_DEVICE_ROCM 10 +// Pinned ROCm CPU memory allocated by hipMallocHost +# define ARROW_DEVICE_ROCM_HOST 11 +// Reserved for extension +# define ARROW_DEVICE_EXT_DEV 12 +// CUDA managed/unified memory allocated by cudaMallocManaged +# define ARROW_DEVICE_CUDA_MANAGED 13 +// unified shared memory allocated on a oneAPI non-partitioned device. +# define ARROW_DEVICE_ONEAPI 14 +// GPU support for next-gen WebGPU standard +# define ARROW_DEVICE_WEBGPU 15 +// Qualcomm Hexagon DSP +# define ARROW_DEVICE_HEXAGON 16 + +struct ArrowDeviceArray { + // the Allocated Array + // + // the buffers in the array (along with the buffers of any + // children) are what is allocated on the device. + struct ArrowArray array; + // The device id to identify a specific device + int64_t device_id; + // The type of device which can access this memory. + ArrowDeviceType device_type; + // An event-like object to synchronize on if needed. + void* sync_event; + // Reserved bytes for future expansion. + int64_t reserved[3]; +}; + +#endif // ARROW_C_DEVICE_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +# define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. 
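Ownership of these structs is carried entirely by the release callback: a consumer must call it exactly once, and a NULL release pointer marks the struct as already released. A minimal consumer-side sketch (illustrative function, not part of abi.h):

#include "arrow/c/abi.h"

// Discard an ArrowArray received over the C data interface without reading it.
void DiscardImportedArray(struct ArrowArray* array) {
  if (array->release != nullptr) {  // NULL means the producer already released it
    array->release(array);          // frees buffers/children and sets array->release to NULL
  }
}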
+ // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE + +#ifndef ARROW_C_DEVICE_STREAM_INTERFACE +# define ARROW_C_DEVICE_STREAM_INTERFACE + +// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays. +// +// This stream is intended to provide a stream of data on a single +// device, if a producer wants data to be produced on multiple devices +// then multiple streams should be provided. One per device. +struct ArrowDeviceArrayStream { + // The device that this stream produces data on. + ArrowDeviceType device_type; + + // Callback to get the stream schema + // (will be the same for all arrays in the stream). + // + // Return value 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + // The schema should be accessible via CPU memory. + int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowDeviceArray must be released independently from the stream. + int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowDeviceArrayStream* self); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowDeviceArrayStream* self); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DEVICE_STREAM_INTERFACE + +#ifdef __cplusplus +} +#endif diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/bridge.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/bridge.h new file mode 100644 index 0000000000000000000000000000000000000000..45367e4f9306247fa8f06cbbf101dab8f498ac96 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/bridge.h @@ -0,0 +1,409 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/c/abi.h" +#include "arrow/device.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \defgroup c-data-interface Functions for working with the C data interface. +/// +/// @{ + +/// \brief Export C++ DataType using the C data interface format. +/// +/// The root type is considered to have empty name and metadata. +/// If you want the root type to have a name and/or metadata, pass +/// a Field instead. +/// +/// \param[in] type DataType object to export +/// \param[out] out C struct where to export the datatype +ARROW_EXPORT +Status ExportType(const DataType& type, struct ArrowSchema* out); + +/// \brief Export C++ Field using the C data interface format. +/// +/// \param[in] field Field object to export +/// \param[out] out C struct where to export the field +ARROW_EXPORT +Status ExportField(const Field& field, struct ArrowSchema* out); + +/// \brief Export C++ Schema using the C data interface format. +/// +/// \param[in] schema Schema object to export +/// \param[out] out C struct where to export the field +ARROW_EXPORT +Status ExportSchema(const Schema& schema, struct ArrowSchema* out); + +/// \brief Export C++ Array using the C data interface format. +/// +/// The resulting ArrowArray struct keeps the array data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] array Array object to export +/// \param[out] out C struct where to export the array +/// \param[out] out_schema optional C struct where to export the array type +ARROW_EXPORT +Status ExportArray(const Array& array, struct ArrowArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief Export C++ RecordBatch using the C data interface format. +/// +/// The record batch is exported as if it were a struct array. +/// The resulting ArrowArray struct keeps the record batch data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] batch Record batch to export +/// \param[out] out C struct where to export the record batch +/// \param[out] out_schema optional C struct where to export the record batch schema +ARROW_EXPORT +Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief Import C++ DataType from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails. 
+/// +/// \param[in,out] schema C data interface struct representing the data type +/// \return Imported type object +ARROW_EXPORT +Result> ImportType(struct ArrowSchema* schema); + +/// \brief Import C++ Field from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails. +/// +/// \param[in,out] schema C data interface struct representing the field +/// \return Imported field object +ARROW_EXPORT +Result> ImportField(struct ArrowSchema* schema); + +/// \brief Import C++ Schema from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails. +/// +/// \param[in,out] schema C data interface struct representing the field +/// \return Imported field object +ARROW_EXPORT +Result> ImportSchema(struct ArrowSchema* schema); + +/// \brief Import C++ array from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in] type type of the imported array +/// \return Imported array object +ARROW_EXPORT +Result> ImportArray(struct ArrowArray* array, + std::shared_ptr type); + +/// \brief Import C++ array and its type from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// The ArrowSchema struct is released, even if this function fails. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in,out] type C data interface struct holding the array type +/// \return Imported array object +ARROW_EXPORT +Result> ImportArray(struct ArrowArray* array, + struct ArrowSchema* type); + +/// \brief Import C++ record batch from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in] schema schema of the imported record batch +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportRecordBatch(struct ArrowArray* array, + std::shared_ptr schema); + +/// \brief Import C++ record batch and its schema from the C data interface. +/// +/// The type represented by the ArrowSchema struct must be a struct type array. +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The ArrowSchema struct is released, even if this function fails. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in,out] schema C data interface struct holding the record batch schema +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportRecordBatch(struct ArrowArray* array, + struct ArrowSchema* schema); + +/// @} + +/// \defgroup c-data-device-interface Functions for working with the C data device +/// interface. +/// +/// @{ + +/// \brief EXPERIMENTAL: Export C++ Array as an ArrowDeviceArray. +/// +/// The resulting ArrowDeviceArray struct keeps the array data and buffers alive +/// until its release callback is called by the consumer. 
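A round-trip sketch for the non-device functions above (illustrative helper name; assumes the usual arrow::Result return values behind ARROW_ASSIGN_OR_RAISE): export a C++ array into the C structs, then import it back, which consumes the array struct and releases the schema even if the import fails.

#include <memory>

#include "arrow/array.h"
#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status RoundTripThroughC(const std::shared_ptr<arrow::Array>& array,
                                std::shared_ptr<arrow::Array>* out) {
  struct ArrowArray c_array;
  struct ArrowSchema c_schema;
  // Export keeps the Arrow buffers alive until c_array.release is called.
  ARROW_RETURN_NOT_OK(arrow::ExportArray(*array, &c_array, &c_schema));
  // Import moves the structs' contents into the resulting C++ array.
  ARROW_ASSIGN_OR_RAISE(*out, arrow::ImportArray(&c_array, &c_schema));
  return arrow::Status::OK();
}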
All buffers in +/// the provided array MUST have the same device_type, otherwise an error +/// will be returned. +/// +/// If sync is non-null, get_event will be called on it in order to +/// potentially provide an event for consumers to synchronize on. +/// +/// \param[in] array Array object to export +/// \param[in] sync shared_ptr to object derived from Device::SyncEvent or null +/// \param[out] out C struct to export the array to +/// \param[out] out_schema optional C struct to export the array type to +ARROW_EXPORT +Status ExportDeviceArray(const Array& array, std::shared_ptr sync, + struct ArrowDeviceArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief EXPERIMENTAL: Export C++ RecordBatch as an ArrowDeviceArray. +/// +/// The record batch is exported as if it were a struct array. +/// The resulting ArrowDeviceArray struct keeps the record batch data and buffers alive +/// until its release callback is called by the consumer. +/// +/// All buffers of all columns in the record batch must have the same device_type +/// otherwise an error will be returned. If columns are on different devices, +/// they should be exported using different ArrowDeviceArray instances. +/// +/// If sync is non-null, get_event will be called on it in order to +/// potentially provide an event for consumers to synchronize on. +/// +/// \param[in] batch Record batch to export +/// \param[in] sync shared_ptr to object derived from Device::SyncEvent or null +/// \param[out] out C struct where to export the record batch +/// \param[out] out_schema optional C struct where to export the record batch schema +ARROW_EXPORT +Status ExportDeviceRecordBatch(const RecordBatch& batch, + std::shared_ptr sync, + struct ArrowDeviceArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +using DeviceMemoryMapper = + std::function>(ArrowDeviceType, int64_t)>; + +ARROW_EXPORT +Result> DefaultDeviceMemoryMapper( + ArrowDeviceType device_type, int64_t device_id); + +/// \brief EXPERIMENTAL: Import C++ device array from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. The +/// buffers of the Array are located on the device indicated by the device_type. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in] type type of the imported array +/// \param[in] mapper A function to map device + id to memory manager. If not +/// specified, defaults to map "cpu" to the built-in default memory manager. +/// \return Imported array object +ARROW_EXPORT +Result> ImportDeviceArray( + struct ArrowDeviceArray* array, std::shared_ptr type, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief EXPERIMENTAL: Import C++ device array and its type from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// The ArrowSchema struct is released, even if this function fails. The +/// buffers of the Array are located on the device indicated by the device_type. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in,out] type C data interface struct holding the array type +/// \param[in] mapper A function to map device + id to memory manager. If not +/// specified, defaults to map "cpu" to the built-in default memory manager. 
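A CPU-only sketch of the experimental device variants above (illustrative helper name): a null sync event is passed because CPU buffers need no synchronization, and DefaultDeviceMemoryMapper is used implicitly via the default argument.

#include <memory>

#include "arrow/array.h"
#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status DeviceRoundTripCpu(const std::shared_ptr<arrow::Array>& array,
                                 std::shared_ptr<arrow::Array>* out) {
  struct ArrowDeviceArray c_array;
  // No schema is exported here, so the logical type travels separately.
  ARROW_RETURN_NOT_OK(arrow::ExportDeviceArray(*array, /*sync=*/nullptr, &c_array));
  ARROW_ASSIGN_OR_RAISE(*out, arrow::ImportDeviceArray(&c_array, array->type()));
  return arrow::Status::OK();
}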
+/// \return Imported array object +ARROW_EXPORT +Result> ImportDeviceArray( + struct ArrowDeviceArray* array, struct ArrowSchema* type, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device from the C data +/// interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The buffers of all columns of the record batch are located on the device +/// indicated by the device type. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in] schema schema of the imported record batch +/// \param[in] mapper A function to map device + id to memory manager. If not +/// specified, defaults to map "cpu" to the built-in default memory manager. +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportDeviceRecordBatch( + struct ArrowDeviceArray* array, std::shared_ptr schema, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device and its schema +/// from the C data interface. +/// +/// The type represented by the ArrowSchema struct must be a struct type array. +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The ArrowSchema struct is released, even if this function fails. The buffers +/// of all columns of the record batch are located on the device indicated by the +/// device type. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in,out] schema C data interface struct holding the record batch schema +/// \param[in] mapper A function to map device + id to memory manager. If not +/// specified, defaults to map "cpu" to the built-in default memory manager. +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportDeviceRecordBatch( + struct ArrowDeviceArray* array, struct ArrowSchema* schema, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// @} + +/// \defgroup c-stream-interface Functions for working with the C data interface. +/// +/// @{ + +/// \brief Export C++ RecordBatchReader using the C stream interface. +/// +/// The resulting ArrowArrayStream struct keeps the record batch reader alive +/// until its release callback is called by the consumer. +/// +/// \param[in] reader RecordBatchReader object to export +/// \param[out] out C struct where to export the stream +ARROW_EXPORT +Status ExportRecordBatchReader(std::shared_ptr reader, + struct ArrowArrayStream* out); + +/// \brief Export C++ ChunkedArray using the C data interface format. +/// +/// The resulting ArrowArrayStream struct keeps the chunked array data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] chunked_array ChunkedArray object to export +/// \param[out] out C struct where to export the stream +ARROW_EXPORT +Status ExportChunkedArray(std::shared_ptr chunked_array, + struct ArrowArrayStream* out); + +/// \brief Export C++ RecordBatchReader using the C device stream interface +/// +/// The resulting ArrowDeviceArrayStream struct keeps the record batch reader +/// alive until its release callback is called by the consumer. The device +/// type is determined by calling device_type() on the RecordBatchReader. 
+/// +/// \param[in] reader RecordBatchReader object to export +/// \param[out] out C struct to export the stream to +ARROW_EXPORT +Status ExportDeviceRecordBatchReader(std::shared_ptr reader, + struct ArrowDeviceArrayStream* out); + +/// \brief Export C++ ChunkedArray using the C device data interface format. +/// +/// The resulting ArrowDeviceArrayStream keeps the chunked array data and buffers +/// alive until its release callback is called by the consumer. +/// +/// \param[in] chunked_array ChunkedArray object to export +/// \param[in] device_type the device type the data is located on +/// \param[out] out C struct to export the stream to +ARROW_EXPORT +Status ExportDeviceChunkedArray(std::shared_ptr chunked_array, + DeviceAllocationType device_type, + struct ArrowDeviceArrayStream* out); + +/// \brief Import C++ RecordBatchReader from the C stream interface. +/// +/// The ArrowArrayStream struct has its contents moved to a private object +/// held alive by the resulting record batch reader. +/// +/// \param[in,out] stream C stream interface struct +/// \return Imported RecordBatchReader object +ARROW_EXPORT +Result> ImportRecordBatchReader( + struct ArrowArrayStream* stream); + +/// \brief Import C++ ChunkedArray from the C stream interface +/// +/// The ArrowArrayStream struct has its contents moved to a private object, +/// is consumed in its entirity, and released before returning all chunks +/// as a ChunkedArray. +/// +/// \param[in,out] stream C stream interface struct +/// \return Imported ChunkedArray object +ARROW_EXPORT +Result> ImportChunkedArray(struct ArrowArrayStream* stream); + +/// \brief Import C++ RecordBatchReader from the C device stream interface +/// +/// The ArrowDeviceArrayStream struct has its contents moved to a private object +/// held alive by the resulting record batch reader. +/// +/// \note If there was a required sync event, sync events are accessible by individual +/// buffers of columns. We are not yet bubbling the sync events from the buffers up to +/// the `GetSyncEvent` method of an imported RecordBatch. This will be added in a future +/// update. +/// +/// \param[in,out] stream C device stream interface struct +/// \param[in] mapper mapping from device type and ID to memory manager +/// \return Imported RecordBatchReader object +ARROW_EXPORT +Result> ImportDeviceRecordBatchReader( + struct ArrowDeviceArrayStream* stream, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief Import C++ ChunkedArray from the C device stream interface +/// +/// The ArrowDeviceArrayStream struct has its contents moved to a private object, +/// is consumed in its entirety, and released before returning all chunks as a +/// ChunkedArray. +/// +/// \note Any chunks that require synchronization for their device memory will have +/// the SyncEvent objects available by checking the individual buffers of each chunk. +/// These SyncEvents should be checked before accessing the data in those buffers. 
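The stream functions compose the same way. A sketch that exports a RecordBatchReader through an ArrowArrayStream and imports it again (illustrative helper name):

#include <memory>
#include <utility>

#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status ReaderRoundTrip(std::shared_ptr<arrow::RecordBatchReader> reader,
                              std::shared_ptr<arrow::RecordBatchReader>* out) {
  struct ArrowArrayStream c_stream;
  // The exported stream keeps `reader` alive until its release callback runs.
  ARROW_RETURN_NOT_OK(arrow::ExportRecordBatchReader(std::move(reader), &c_stream));
  // Importing moves the stream's contents into the new reader; c_stream is consumed.
  ARROW_ASSIGN_OR_RAISE(*out, arrow::ImportRecordBatchReader(&c_stream));
  return arrow::Status::OK();
}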
+/// +/// \param[in,out] stream C device stream interface struct +/// \param[in] mapper mapping from device type and ID to memory manager +/// \return Imported ChunkedArray object +ARROW_EXPORT +Result> ImportDeviceChunkedArray( + struct ArrowDeviceArrayStream* stream, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// @} + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack.h new file mode 100644 index 0000000000000000000000000000000000000000..d11ccfc1fd72253600501d7de3a150944608ca06 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/array/array_base.h" +#include "arrow/c/dlpack_abi.h" + +namespace arrow::dlpack { + +/// \brief Export Arrow array as DLPack tensor. +/// +/// DLMangedTensor is produced as defined by the DLPack protocol, +/// see https://dmlc.github.io/dlpack/latest/. +/// +/// Data types for which the protocol is supported are +/// integer and floating-point data types. +/// +/// DLPack protocol only supports arrays with one contiguous +/// memory region which means Arrow Arrays with validity buffers +/// are not supported. +/// +/// \param[in] arr Arrow array +/// \return DLManagedTensor struct +ARROW_EXPORT +Result ExportArray(const std::shared_ptr& arr); + +/// \brief Get DLDevice with enumerator specifying the +/// type of the device data is stored on and index of the +/// device which is 0 by default for CPU. +/// +/// \param[in] arr Arrow array +/// \return DLDevice struct +ARROW_EXPORT +Result ExportDevice(const std::shared_ptr& arr); + +} // namespace arrow::dlpack diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack_abi.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack_abi.h new file mode 100644 index 0000000000000000000000000000000000000000..fbe2a56a344b373f3d3e950e434ba5392036a080 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack_abi.h @@ -0,0 +1,321 @@ +// Taken from: +// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +# define DLPACK_EXTERN_C extern "C" +#else +# define DLPACK_EXTERN_C +#endif + +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! 
\brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 0 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +# ifdef DLPACK_EXPORTS +# define DLPACK_DLL __declspec(dllexport) +# else +# define DLPACK_DLL __declspec(dllimport) +# endif +#else +# define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + +/*! + * \brief The device type in DLDevice. + */ +#ifdef __cplusplus +typedef enum : int32_t { +#else +typedef enum { +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! 
+ * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, + * the underlying storage size of bool is 8 bits) + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned + */ +typedef struct DLManagedTensor { + /*! 
\brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void* manager_ctx; + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. It can be + * NULL if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor* self); +} DLManagedTensor; + +// bit masks used in in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief A versioned and managed C Tensor object, manage memory of DLTensor. + * + * This data structure is intended to facilitate the borrowing of DLTensor by + * another framework. It is not meant to transfer the tensor. When the borrowing + * framework doesn't need the tensor, it should call the deleter to notify the + * host that the resource is no longer needed. + * + * \note This is the current standard DLPack exchange data structure. + */ +struct DLManagedTensorVersioned { + /*! + * \brief The API and ABI version of the current managed Tensor + */ + DLPackVersion version; + /*! + * \brief the context of the original host framework. + * + * Stores DLManagedTensorVersioned is used in the + * framework. It can also be NULL. + */ + void* manager_ctx; + /*! + * \brief Destructor. + * + * This should be called to destruct manager_ctx which holds the + * DLManagedTensorVersioned. It can be NULL if there is no way for the caller to provide + * a reasonable destructor. The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensorVersioned* self); + /*! + * \brief Additional bitmask flags information about the tensor. + * + * By default the flags should be set to 0. + * + * \note Future ABI changes should keep everything until this field + * stable, to ensure that deleter can be correctly called. + * + * \sa DLPACK_FLAG_BITMASK_READ_ONLY + */ + uint64_t flags; + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; +}; + +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/helpers.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..6e4df17f43ebfe238484056fedbd4e6d575460f0 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/c/helpers.h @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
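For the DLPack bridge in dlpack.h above, a hedged sketch of producing and then disposing of a tensor. It assumes the declared return types are arrow::Result<DLManagedTensor*> and arrow::Result<DLDevice>, and it only applies to integer or floating-point arrays without a validity bitmap, as the header notes.

#include <memory>

#include "arrow/array.h"
#include "arrow/c/dlpack.h"
#include "arrow/c/dlpack_abi.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status ExportToDLPack(const std::shared_ptr<arrow::Array>& arr) {
  ARROW_ASSIGN_OR_RAISE(DLManagedTensor* tensor, arrow::dlpack::ExportArray(arr));
  ARROW_ASSIGN_OR_RAISE(DLDevice device, arrow::dlpack::ExportDevice(arr));
  (void)device;  // device_type is kDLCPU for CPU-resident arrays
  // A real consumer would take ownership of `tensor`; whoever ends up with it
  // must call the deleter exactly once.
  if (tensor->deleter != nullptr) {
    tensor->deleter(tensor);
  }
  return arrow::Status::OK();
}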
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/c/abi.h" + +#define ARROW_C_ASSERT(condition, msg) \ + do { \ + if (!(condition)) { \ + fprintf(stderr, "%s:%d:: %s", __FILE__, __LINE__, (msg)); \ + abort(); \ + } \ + } while (0) + +#ifdef __cplusplus +extern "C" { +#endif + +/// Query whether the C schema is released +inline int ArrowSchemaIsReleased(const struct ArrowSchema* schema) { + return schema->release == NULL; +} + +/// Mark the C schema released (for use in release callbacks) +inline void ArrowSchemaMarkReleased(struct ArrowSchema* schema) { + schema->release = NULL; +} + +/// Move the C schema from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid schema already, otherwise there +/// will be a memory leak. +inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) { + assert(dest != src); + assert(!ArrowSchemaIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowSchema)); + ArrowSchemaMarkReleased(src); +} + +/// Release the C schema, if necessary, by calling its release callback +inline void ArrowSchemaRelease(struct ArrowSchema* schema) { + if (!ArrowSchemaIsReleased(schema)) { + schema->release(schema); + ARROW_C_ASSERT(ArrowSchemaIsReleased(schema), + "ArrowSchemaRelease did not cleanup release callback"); + } +} + +/// Query whether the C array is released +inline int ArrowArrayIsReleased(const struct ArrowArray* array) { + return array->release == NULL; +} + +inline int ArrowDeviceArrayIsReleased(const struct ArrowDeviceArray* array) { + return ArrowArrayIsReleased(&array->array); +} + +/// Mark the C array released (for use in release callbacks) +inline void ArrowArrayMarkReleased(struct ArrowArray* array) { array->release = NULL; } + +inline void ArrowDeviceArrayMarkReleased(struct ArrowDeviceArray* array) { + ArrowArrayMarkReleased(&array->array); +} + +/// Move the C array from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid array already, otherwise there +/// will be a memory leak. 
+inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) { + assert(dest != src); + assert(!ArrowArrayIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowArray)); + ArrowArrayMarkReleased(src); +} + +inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src, + struct ArrowDeviceArray* dest) { + assert(dest != src); + assert(!ArrowDeviceArrayIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowDeviceArray)); + ArrowDeviceArrayMarkReleased(src); +} + +/// Release the C array, if necessary, by calling its release callback +inline void ArrowArrayRelease(struct ArrowArray* array) { + if (!ArrowArrayIsReleased(array)) { + array->release(array); + ARROW_C_ASSERT(ArrowArrayIsReleased(array), + "ArrowArrayRelease did not cleanup release callback"); + } +} + +inline void ArrowDeviceArrayRelease(struct ArrowDeviceArray* array) { + if (!ArrowDeviceArrayIsReleased(array)) { + array->array.release(&array->array); + ARROW_C_ASSERT(ArrowDeviceArrayIsReleased(array), + "ArrowDeviceArrayRelease did not cleanup release callback"); + } +} + +/// Query whether the C array stream is released +inline int ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) { + return stream->release == NULL; +} + +inline int ArrowDeviceArrayStreamIsReleased(const struct ArrowDeviceArrayStream* stream) { + return stream->release == NULL; +} + +/// Mark the C array stream released (for use in release callbacks) +inline void ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) { + stream->release = NULL; +} + +inline void ArrowDeviceArrayStreamMarkReleased(struct ArrowDeviceArrayStream* stream) { + stream->release = NULL; +} + +/// Move the C array stream from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid stream already, otherwise there +/// will be a memory leak. 
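These helpers encode the move/release discipline of the C data interface: a move marks the source as released, and the Release wrappers are safe to call on an already-released struct. A small sketch (illustrative function name):

#include "arrow/c/abi.h"
#include "arrow/c/helpers.h"

void MoveThenRelease(struct ArrowSchema* src) {
  struct ArrowSchema tmp;
  ArrowSchemaMove(src, &tmp);  // tmp now owns the schema; src->release is set to NULL
  ArrowSchemaRelease(src);     // no-op: src is already marked released
  ArrowSchemaRelease(&tmp);    // runs the producer's release callback exactly once
}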
+inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dest) { + assert(dest != src); + assert(!ArrowArrayStreamIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowArrayStream)); + ArrowArrayStreamMarkReleased(src); +} + +inline void ArrowDeviceArrayStreamMove(struct ArrowDeviceArrayStream* src, + struct ArrowDeviceArrayStream* dest) { + assert(dest != src); + assert(!ArrowDeviceArrayStreamIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowDeviceArrayStream)); + ArrowDeviceArrayStreamMarkReleased(src); +} + +/// Release the C array stream, if necessary, by calling its release callback +inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) { + if (!ArrowArrayStreamIsReleased(stream)) { + stream->release(stream); + ARROW_C_ASSERT(ArrowArrayStreamIsReleased(stream), + "ArrowArrayStreamRelease did not cleanup release callback"); + } +} + +inline void ArrowDeviceArrayStreamRelease(struct ArrowDeviceArrayStream* stream) { + if (!ArrowDeviceArrayStreamIsReleased(stream)) { + stream->release(stream); + ARROW_C_ASSERT(ArrowDeviceArrayStreamIsReleased(stream), + "ArrowDeviceArrayStreamRelease did not cleanup release callback"); + } +} + +#ifdef __cplusplus +} +#endif diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/chunk_resolver.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/chunk_resolver.h new file mode 100644 index 0000000000000000000000000000000000000000..4a5e27c05361fbf64caf6e4e6018f4c70d901f05 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/chunk_resolver.h @@ -0,0 +1,284 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" + +namespace arrow::internal { + +struct ChunkResolver; + +template +struct TypedChunkLocation { + /// \brief Index of the chunk in the array of chunks + /// + /// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used + /// to represent out-of-bounds locations. 
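// --- Editor's illustrative sketch (not part of the vendored headers) ---
// Draining an ArrowArrayStream with the release helpers defined in
// arrow/c/helpers.h. The `get_next`/`get_last_error` callbacks come from
// "arrow/c/abi.h"; `drain_stream` is a hypothetical consumer function.
#include <stdio.h>
#include "arrow/c/helpers.h"

static int drain_stream(struct ArrowArrayStream* stream) {
  struct ArrowArray batch;
  for (;;) {
    if (stream->get_next(stream, &batch) != 0) {
      const char* err = stream->get_last_error(stream);
      fprintf(stderr, "stream error: %s\n", err ? err : "(unknown)");
      ArrowArrayStreamRelease(stream);
      return -1;
    }
    if (ArrowArrayIsReleased(&batch)) break;  // a released array marks the end
    /* ... process the batch ... */
    ArrowArrayRelease(&batch);
  }
  ArrowArrayStreamRelease(stream);
  return 0;
}
// --- end of editor's sketch ---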
+ IndexType chunk_index = 0; + + /// \brief Index of the value in the chunk + /// + /// The value is UNDEFINED if chunk_index >= chunks.size() + IndexType index_in_chunk = 0; + + TypedChunkLocation() = default; + + TypedChunkLocation(IndexType chunk_index, IndexType index_in_chunk) + : chunk_index(chunk_index), index_in_chunk(index_in_chunk) { + static_assert(sizeof(TypedChunkLocation) == 2 * sizeof(IndexType)); + static_assert(alignof(TypedChunkLocation) == alignof(IndexType)); + } + + bool operator==(TypedChunkLocation other) const { + return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk; + } +}; + +using ChunkLocation = TypedChunkLocation; + +/// \brief An utility that incrementally resolves logical indices into +/// physical indices in a chunked array. +struct ARROW_EXPORT ChunkResolver { + private: + /// \brief Array containing `chunks.size() + 1` offsets. + /// + /// `offsets_[i]` is the starting logical index of chunk `i`. `offsets_[0]` is always 0 + /// and `offsets_[chunks.size()]` is the logical length of the chunked array. + std::vector offsets_; + + /// \brief Cache of the index of the last resolved chunk. + /// + /// \invariant `cached_chunk_ in [0, chunks.size()]` + mutable std::atomic cached_chunk_; + + public: + explicit ChunkResolver(const ArrayVector& chunks) noexcept; + explicit ChunkResolver(const std::vector& chunks) noexcept; + explicit ChunkResolver(const RecordBatchVector& batches) noexcept; + + /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. + /// + /// The first offset must be 0 and the last offset must be the logical length of the + /// chunked array. Each offset before the last represents the starting logical index of + /// the corresponding chunk. + explicit ChunkResolver(std::vector offsets) noexcept + : offsets_(std::move(offsets)), cached_chunk_(0) { +#ifndef NDEBUG + assert(offsets_.size() >= 1); + assert(offsets_[0] == 0); + for (size_t i = 1; i < offsets_.size(); i++) { + assert(offsets_[i] >= offsets_[i - 1]); + } + assert(offsets_.size() - 1 <= + static_cast(std::numeric_limits::max())); +#endif + } + + ChunkResolver(ChunkResolver&& other) noexcept; + ChunkResolver& operator=(ChunkResolver&& other) noexcept; + + ChunkResolver(const ChunkResolver& other) noexcept; + ChunkResolver& operator=(const ChunkResolver& other) noexcept; + + int64_t logical_array_length() const { return offsets_.back(); } + int32_t num_chunks() const { return static_cast(offsets_.size() - 1); } + + int64_t chunk_length(int64_t chunk_index) const { + return offsets_[chunk_index + 1] - offsets_[chunk_index]; + } + + /// \brief Resolve a logical index to a ChunkLocation. + /// + /// The returned ChunkLocation contains the chunk index and the within-chunk index + /// equivalent to the logical index. + /// + /// \pre index >= 0 + /// \post location.chunk_index in [0, chunks.size()] + /// \param index The logical index to resolve + /// \return ChunkLocation with a valid chunk_index if index is within + /// bounds, or with chunk_index == chunks.size() if logical index is + /// `>= chunked_array.length()`. + inline ChunkLocation Resolve(int64_t index) const { + const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed); + const auto chunk_index = + ResolveChunkIndex(index, cached_chunk); + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; + } + + /// \brief Resolve a logical index to a ChunkLocation. 
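// --- Editor's illustrative sketch (not part of the vendored header) ---
// Resolving logical indices with ChunkResolver, using the offsets constructor
// declared above. Note this type lives in arrow::internal and is not a stable
// public API.
#include <cassert>
#include <cstdint>
#include <vector>
#include "arrow/chunk_resolver.h"

inline void chunk_resolver_example() {
  // Chunks of length 10, 5 and 20 -> offsets {0, 10, 15, 35}.
  arrow::internal::ChunkResolver resolver(std::vector<int64_t>{0, 10, 15, 35});
  arrow::internal::ChunkLocation loc = resolver.Resolve(12);
  assert(loc.chunk_index == 1 && loc.index_in_chunk == 2);
  // Logical indices >= the logical length resolve to chunk_index == num_chunks().
  assert(resolver.Resolve(35).chunk_index == resolver.num_chunks());
}
// --- end of editor's sketch ---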
+ /// + /// The returned ChunkLocation contains the chunk index and the within-chunk index + /// equivalent to the logical index. + /// + /// \pre index >= 0 + /// \post location.chunk_index in [0, chunks.size()] + /// \param index The logical index to resolve + /// \param hint ChunkLocation{} or the last ChunkLocation returned by + /// this ChunkResolver. + /// \return ChunkLocation with a valid chunk_index if index is within + /// bounds, or with chunk_index == chunks.size() if logical index is + /// `>= chunked_array.length()`. + inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const { + assert(hint.chunk_index < static_cast(offsets_.size())); + const auto chunk_index = ResolveChunkIndex( + index, static_cast(hint.chunk_index)); + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; + } + + /// \brief Resolve `n_indices` logical indices to chunk indices. + /// + /// \pre 0 <= logical_index_vec[i] < logical_array_length() + /// (for well-defined and valid chunk index results) + /// \pre out_chunk_location_vec has space for `n_indices` locations + /// \pre chunk_hint in [0, chunks.size()] + /// \post out_chunk_location_vec[i].chunk_index in [0, chunks.size()] for i in [0, n) + /// \post if logical_index_vec[i] >= chunked_array.length(), then + /// out_chunk_location_vec[i].chunk_index == chunks.size() + /// and out_chunk_location_vec[i].index_in_chunk is UNDEFINED (can be + /// out-of-bounds) + /// \post if logical_index_vec[i] < 0, then both values in out_chunk_index_vec[i] + /// are UNDEFINED + /// + /// \param n_indices The number of logical indices to resolve + /// \param logical_index_vec The logical indices to resolve + /// \param out_chunk_location_vec The output array where the locations will be written + /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany + /// \return false iff chunks.size() > std::numeric_limits::max() + template + [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec, + TypedChunkLocation* out_chunk_location_vec, + IndexType chunk_hint = 0) const { + if constexpr (sizeof(IndexType) < sizeof(uint32_t)) { + // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()). + constexpr int64_t kMaxIndexTypeValue = std::numeric_limits::max(); + // A ChunkedArray with enough empty chunks can make the index of a chunk + // exceed the logical index and thus the maximum value of IndexType. + const bool chunk_index_fits_on_type = num_chunks() <= kMaxIndexTypeValue; + if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) { + return false; + } + // Since an index-in-chunk cannot possibly exceed the logical index being + // queried, we don't have to worry about these values not fitting on IndexType. + } + if constexpr (std::is_signed_v) { + // We interpret signed integers as unsigned and avoid having to generate double + // the amount of binary code to handle each integer width. + // + // Negative logical indices can become large values when cast to unsigned, and + // they are gracefully handled by ResolveManyImpl, but both the chunk index + // and the index in chunk values will be undefined in these cases. This + // happend because int8_t(-1) == uint8_t(255) and 255 could be a valid + // logical index in the chunked array. 
+ using U = std::make_unsigned_t; + ResolveManyImpl(n_indices, reinterpret_cast(logical_index_vec), + reinterpret_cast*>(out_chunk_location_vec), + static_cast(chunk_hint)); + } else { + static_assert(std::is_unsigned_v); + ResolveManyImpl(n_indices, logical_index_vec, out_chunk_location_vec, + static_cast(chunk_hint)); + } + return true; + } + + private: + template + inline int64_t ResolveChunkIndex(int64_t index, int32_t cached_chunk) const { + // It is common for algorithms sequentially processing arrays to make consecutive + // accesses at a relatively small distance from each other, hence often falling in the + // same chunk. + // + // This is guaranteed when merging (assuming each side of the merge uses its + // own resolver), and is the most common case in recursive invocations of + // partitioning. + const auto num_offsets = static_cast(offsets_.size()); + const int64_t* offsets = offsets_.data(); + if (ARROW_PREDICT_TRUE(index >= offsets[cached_chunk]) && + (static_cast(cached_chunk + 1) == num_offsets || + index < offsets[cached_chunk + 1])) { + return cached_chunk; + } + // lo < hi is guaranteed by `num_offsets = chunks.size() + 1` + const auto chunk_index = Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets); + if constexpr (StoreCachedChunk) { + assert(static_cast(chunk_index) < static_cast(offsets_.size())); + cached_chunk_.store(chunk_index, std::memory_order_relaxed); + } + return chunk_index; + } + + /// \pre all the pre-conditions of ChunkResolver::ResolveMany() + /// \pre num_offsets - 1 <= std::numeric_limits::max() + void ResolveManyImpl(int64_t, const uint8_t*, TypedChunkLocation*, + int32_t) const; + void ResolveManyImpl(int64_t, const uint16_t*, TypedChunkLocation*, + int32_t) const; + void ResolveManyImpl(int64_t, const uint32_t*, TypedChunkLocation*, + int32_t) const; + void ResolveManyImpl(int64_t, const uint64_t*, TypedChunkLocation*, + int32_t) const; + + public: + /// \brief Find the index of the chunk that contains the logical index. + /// + /// Any non-negative index is accepted. When `hi=num_offsets`, the largest + /// possible return value is `num_offsets-1` which is equal to + /// `chunks.size()`. Which is returned when the logical index is greater or + /// equal the logical length of the chunked array. + /// + /// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned) + /// \pre lo < hi + /// \pre lo >= 0 && hi <= offsets_.size() + static inline int32_t Bisect(int64_t index, const int64_t* offsets, int32_t lo, + int32_t hi) { + return Bisect(static_cast(index), + reinterpret_cast(offsets), static_cast(lo), + static_cast(hi)); + } + + static inline int32_t Bisect(uint64_t index, const uint64_t* offsets, uint32_t lo, + uint32_t hi) { + // Similar to std::upper_bound(), but slightly different as our offsets + // array always starts with 0. + auto n = hi - lo; + // First iteration does not need to check for n > 1 + // (lo < hi is guaranteed by the precondition). 
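    // [Editor's note: illustrative trace, not part of the vendored header.]
    // With offsets = {0, 10, 15, 35}, lo = 0, hi = 4 and index = 12:
    //   n = 4: m = 2, mid = 2, offsets[2] = 15 >  12 -> n = 2
    //   n = 2: m = 1, mid = 1, offsets[1] = 10 <= 12 -> lo = 1, n = 1
    // The loop exits and Bisect returns 1: logical index 12 falls in chunk 1,
    // and the caller computes index_in_chunk = 12 - offsets[1] = 2.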
+ assert(n > 1 && "lo < hi is a precondition of Bisect"); + do { + const uint32_t m = n >> 1; + const uint32_t mid = lo + m; + if (index >= offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } while (n > 1); + return lo; + } +}; + +} // namespace arrow::internal diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/chunked_array.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/chunked_array.h new file mode 100644 index 0000000000000000000000000000000000000000..c65b6cb6e227fcde68b2de0bf6ba391f9bb4cc5a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/chunked_array.h @@ -0,0 +1,283 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/chunk_resolver.h" +#include "arrow/compare.h" +#include "arrow/device_allocation_type_set.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class MemoryPool; +namespace stl { +template +class ChunkedArrayIterator; +} // namespace stl + +/// \class ChunkedArray +/// \brief A data structure managing a list of primitive Arrow arrays logically +/// as one large array +/// +/// Data chunking is treated throughout this project largely as an +/// implementation detail for performance and memory use optimization. +/// ChunkedArray allows Array objects to be collected and interpreted +/// as a single logical array without requiring an expensive concatenation +/// step. +/// +/// In some cases, data produced by a function may exceed the capacity of an +/// Array (like BinaryArray or StringArray) and so returning multiple Arrays is +/// the only possibility. In these cases, we recommend returning a ChunkedArray +/// instead of vector of Arrays or some alternative. +/// +/// When data is processed in parallel, it may not be practical or possible to +/// create large contiguous memory allocations and write output into them. With +/// some data types, like binary and string types, it is not possible at all to +/// produce non-chunked array outputs without requiring a concatenation step at +/// the end of processing. +/// +/// Application developers may tune chunk sizes based on analysis of +/// performance profiles but many developer-users will not need to be +/// especially concerned with the chunking details. +/// +/// Preserving the chunk layout/sizes in processing steps is generally not +/// considered to be a contract in APIs. A function may decide to alter the +/// chunking of its result. 
Similarly, APIs accepting multiple ChunkedArray
+/// inputs should not expect the chunk layout to be the same in each input.
+class ARROW_EXPORT ChunkedArray {
+ public:
+  ChunkedArray(ChunkedArray&&) = default;
+  ChunkedArray& operator=(ChunkedArray&&) = default;
+
+  /// \brief Construct a chunked array from a single Array
+  explicit ChunkedArray(std::shared_ptr<Array> chunk)
+      : ChunkedArray(ArrayVector{std::move(chunk)}) {}
+
+  /// \brief Construct a chunked array from a vector of arrays and an optional data type
+  ///
+  /// The vector elements must have the same data type.
+  /// If the data type is passed explicitly, the vector may be empty.
+  /// If the data type is omitted, the vector must be non-empty.
+  explicit ChunkedArray(ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
+
+  // \brief Constructor with basic input validation.
+  static Result<std::shared_ptr<ChunkedArray>> Make(
+      ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
+
+  /// \brief Create an empty ChunkedArray of a given type
+  ///
+  /// The output ChunkedArray will have one chunk with an empty
+  /// array of the given type.
+  ///
+  /// \param[in] type the data type of the empty ChunkedArray
+  /// \param[in] pool the memory pool to allocate memory from
+  /// \return the resulting ChunkedArray
+  static Result<std::shared_ptr<ChunkedArray>> MakeEmpty(
+      std::shared_ptr<DataType> type, MemoryPool* pool = default_memory_pool());
+
+  /// \return the total length of the chunked array; computed on construction
+  int64_t length() const { return length_; }
+
+  /// \return the total number of nulls among all chunks
+  int64_t null_count() const { return null_count_; }
+
+  /// \return the total number of chunks in the chunked array
+  int num_chunks() const { return static_cast<int>(chunks_.size()); }
+
+  /// \return chunk a particular chunk from the chunked array
+  const std::shared_ptr<Array>& chunk(int i) const { return chunks_[i]; }
+
+  /// \return an ArrayVector of chunks
+  const ArrayVector& chunks() const { return chunks_; }
+
+  /// \return The set of device allocation types used by the chunks in this
+  /// chunked array.
+  DeviceAllocationTypeSet device_types() const;
+
+  /// \return true if all chunks are allocated on CPU-accessible memory.
+  bool is_cpu() const { return device_types().is_cpu_only(); }
+
+  /// \brief Construct a zero-copy slice of the chunked array with the
+  /// indicated offset and length
+  ///
+  /// \param[in] offset the position of the first element in the constructed
+  /// slice
+  /// \param[in] length the length of the slice. If there are not enough
+  /// elements in the chunked array, the length will be adjusted accordingly
+  ///
+  /// \return a new object wrapped in std::shared_ptr<ChunkedArray>
+  std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const;
+
+  /// \brief Slice from offset until end of the chunked array
+  std::shared_ptr<ChunkedArray> Slice(int64_t offset) const;
+
+  /// \brief Flatten this chunked array as a vector of chunked arrays, one
+  /// for each struct field
+  ///
+  /// \param[in] pool The pool for buffer allocations, if any
+  Result<std::vector<std::shared_ptr<ChunkedArray>>> Flatten(
+      MemoryPool* pool = default_memory_pool()) const;
+
+  /// Construct a zero-copy view of this chunked array with the given
+  /// type. Calls Array::View on each constituent chunk.
Always succeeds if + /// there are zero chunks + Result> View(const std::shared_ptr& type) const; + + /// \brief Return the type of the chunked array + const std::shared_ptr& type() const { return type_; } + + /// \brief Return a Scalar containing the value of this array at index + Result> GetScalar(int64_t index) const; + + /// \brief Determine if two chunked arrays are equal. + /// + /// Two chunked arrays can be equal only if they have equal datatypes. + /// However, they may be equal even if they have different chunkings. + bool Equals(const ChunkedArray& other, + const EqualOptions& opts = EqualOptions::Defaults()) const; + /// \brief Determine if two chunked arrays are equal. + bool Equals(const std::shared_ptr& other, + const EqualOptions& opts = EqualOptions::Defaults()) const; + /// \brief Determine if two chunked arrays approximately equal + bool ApproxEquals(const ChunkedArray& other, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// \return PrettyPrint representation suitable for debugging + std::string ToString() const; + + /// \brief Perform cheap validation checks to determine obvious inconsistencies + /// within the chunk array's internal data. + /// + /// This is O(k*m) where k is the number of array descendents, + /// and m is the number of chunks. + /// + /// \return Status + Status Validate() const; + + /// \brief Perform extensive validation checks to determine inconsistencies + /// within the chunk array's internal data. + /// + /// This is O(k*n) where k is the number of array descendents, + /// and n is the length in elements. + /// + /// \return Status + Status ValidateFull() const; + + protected: + ArrayVector chunks_; + std::shared_ptr type_; + int64_t length_; + int64_t null_count_; + + private: + template + friend class ::arrow::stl::ChunkedArrayIterator; + internal::ChunkResolver chunk_resolver_; + ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); +}; + +namespace internal { + +/// \brief EXPERIMENTAL: Utility for incremental iteration over contiguous +/// pieces of potentially differently-chunked ChunkedArray objects +class ARROW_EXPORT MultipleChunkIterator { + public: + MultipleChunkIterator(const ChunkedArray& left, const ChunkedArray& right) + : left_(left), + right_(right), + pos_(0), + length_(left.length()), + chunk_idx_left_(0), + chunk_idx_right_(0), + chunk_pos_left_(0), + chunk_pos_right_(0) {} + + bool Next(std::shared_ptr* next_left, std::shared_ptr* next_right); + + int64_t position() const { return pos_; } + + private: + const ChunkedArray& left_; + const ChunkedArray& right_; + + // The amount of the entire ChunkedArray consumed + int64_t pos_; + + // Length of the chunked array(s) + int64_t length_; + + // Current left chunk + int chunk_idx_left_; + + // Current right chunk + int chunk_idx_right_; + + // Offset into the current left chunk + int64_t chunk_pos_left_; + + // Offset into the current right chunk + int64_t chunk_pos_right_; +}; + +/// \brief Evaluate binary function on two ChunkedArray objects having possibly +/// different chunk layouts. The passed binary function / functor should have +/// the following signature. +/// +/// Status(const Array&, const Array&, int64_t) +/// +/// The third argument is the absolute position relative to the start of each +/// ChunkedArray. The function is executed against each contiguous pair of +/// array segments, slicing if necessary. 
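// --- Editor's illustrative sketch (not part of the vendored header) ---
// Building a ChunkedArray from two chunks and slicing it without copying,
// using only the Make() and Slice() declarations above. Assumes a regular
// Arrow C++ build; `make_and_slice` is a hypothetical helper.
#include <memory>
#include <utility>
#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::ChunkedArray>> make_and_slice(
    std::shared_ptr<arrow::Array> chunk1, std::shared_ptr<arrow::Array> chunk2) {
  // Make() validates that both chunks share the same data type.
  ARROW_ASSIGN_OR_RAISE(
      auto chunked,
      arrow::ChunkedArray::Make({std::move(chunk1), std::move(chunk2)}));
  // Zero-copy view of up to 10 elements starting at logical offset 5; the
  // length is clamped if the chunked array is shorter.
  return chunked->Slice(5, 10);
}
// --- end of editor's sketch ---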
+/// +/// For example, if two arrays have chunk sizes +/// +/// left: [10, 10, 20] +/// right: [15, 10, 15] +/// +/// Then the following invocations take place (pseudocode) +/// +/// func(left.chunk[0][0:10], right.chunk[0][0:10], 0) +/// func(left.chunk[1][0:5], right.chunk[0][10:15], 10) +/// func(left.chunk[1][5:10], right.chunk[1][0:5], 15) +/// func(left.chunk[2][0:5], right.chunk[1][5:10], 20) +/// func(left.chunk[2][5:20], right.chunk[2][:], 25) +template +Status ApplyBinaryChunked(const ChunkedArray& left, const ChunkedArray& right, + Action&& action) { + MultipleChunkIterator iterator(left, right); + std::shared_ptr left_piece, right_piece; + while (iterator.Next(&left_piece, &right_piece)) { + ARROW_RETURN_NOT_OK(action(*left_piece, *right_piece, iterator.position())); + } + return Status::OK(); +} + +} // namespace internal +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compare.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compare.h new file mode 100644 index 0000000000000000000000000000000000000000..6dbacfa86af592c1e2aecf22aea2322ce5bc5090 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compare.h @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for comparing Arrow data structures + +#pragma once + +#include +#include + +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class Tensor; +class SparseTensor; +struct Scalar; + +static constexpr double kDefaultAbsoluteTolerance = 1E-5; + +/// A container of options for equality comparisons +class EqualOptions { + public: + /// Whether or not NaNs are considered equal. + bool nans_equal() const { return nans_equal_; } + + /// Return a new EqualOptions object with the "nans_equal" property changed. + EqualOptions nans_equal(bool v) const { + auto res = EqualOptions(*this); + res.nans_equal_ = v; + return res; + } + + /// Whether or not zeros with differing signs are considered equal. + bool signed_zeros_equal() const { return signed_zeros_equal_; } + + /// Return a new EqualOptions object with the "signed_zeros_equal" property changed. + EqualOptions signed_zeros_equal(bool v) const { + auto res = EqualOptions(*this); + res.signed_zeros_equal_ = v; + return res; + } + + /// The absolute tolerance for approximate comparisons of floating-point values. + double atol() const { return atol_; } + + /// Return a new EqualOptions object with the "atol" property changed. + EqualOptions atol(double v) const { + auto res = EqualOptions(*this); + res.atol_ = v; + return res; + } + + /// The ostream to which a diff will be formatted if arrays disagree. 
+ /// If this is null (the default) no diff will be formatted. + std::ostream* diff_sink() const { return diff_sink_; } + + /// Return a new EqualOptions object with the "diff_sink" property changed. + /// This option will be ignored if diff formatting of the types of compared arrays is + /// not supported. + EqualOptions diff_sink(std::ostream* diff_sink) const { + auto res = EqualOptions(*this); + res.diff_sink_ = diff_sink; + return res; + } + + static EqualOptions Defaults() { return {}; } + + protected: + double atol_ = kDefaultAbsoluteTolerance; + bool nans_equal_ = false; + bool signed_zeros_equal_ = true; + + std::ostream* diff_sink_ = NULLPTR; +}; + +/// Returns true if the arrays are exactly equal +ARROW_EXPORT bool ArrayEquals(const Array& left, const Array& right, + const EqualOptions& = EqualOptions::Defaults()); + +/// Returns true if the arrays are approximately equal. For non-floating point +/// types, this is equivalent to ArrayEquals(left, right) +ARROW_EXPORT bool ArrayApproxEquals(const Array& left, const Array& right, + const EqualOptions& = EqualOptions::Defaults()); + +/// Returns true if indicated equal-length segment of arrays are exactly equal +ARROW_EXPORT bool ArrayRangeEquals(const Array& left, const Array& right, + int64_t start_idx, int64_t end_idx, + int64_t other_start_idx, + const EqualOptions& = EqualOptions::Defaults()); + +/// Returns true if indicated equal-length segment of arrays are approximately equal +ARROW_EXPORT bool ArrayRangeApproxEquals(const Array& left, const Array& right, + int64_t start_idx, int64_t end_idx, + int64_t other_start_idx, + const EqualOptions& = EqualOptions::Defaults()); + +ARROW_EXPORT bool TensorEquals(const Tensor& left, const Tensor& right, + const EqualOptions& = EqualOptions::Defaults()); + +/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal +ARROW_EXPORT bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right, + const EqualOptions& = EqualOptions::Defaults()); + +/// Returns true if the type metadata are exactly equal +/// \param[in] left a DataType +/// \param[in] right a DataType +/// \param[in] check_metadata whether to compare KeyValueMetadata for child +/// fields +ARROW_EXPORT bool TypeEquals(const DataType& left, const DataType& right, + bool check_metadata = true); + +/// Returns true if scalars are equal +/// \param[in] left a Scalar +/// \param[in] right a Scalar +/// \param[in] options comparison options +ARROW_EXPORT bool ScalarEquals(const Scalar& left, const Scalar& right, + const EqualOptions& options = EqualOptions::Defaults()); + +/// Returns true if scalars are approximately equal +/// \param[in] left a Scalar +/// \param[in] right a Scalar +/// \param[in] options comparison options +ARROW_EXPORT bool ScalarApproxEquals( + const Scalar& left, const Scalar& right, + const EqualOptions& options = EqualOptions::Defaults()); + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api.h new file mode 100644 index 0000000000000000000000000000000000000000..b701d9928691f42b70a201569feb27d5ea86f8cd --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
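// --- Editor's illustrative sketch (not part of the vendored headers) ---
// Approximate array comparison with a custom tolerance, NaN handling and a
// diff sink, using the EqualOptions setters and ArrayApproxEquals declared
// above. `roughly_equal` is a hypothetical helper.
#include <iostream>
#include "arrow/compare.h"

bool roughly_equal(const arrow::Array& left, const arrow::Array& right) {
  auto options = arrow::EqualOptions::Defaults()
                     .atol(1e-3)              // absolute tolerance for floats
                     .nans_equal(true)        // treat NaN == NaN
                     .diff_sink(&std::cerr);  // print a diff when they differ
  return arrow::ArrayApproxEquals(left, right, options);
}
// --- end of editor's sketch ---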
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +/// \defgroup compute-functions Abstract compute function API +/// @{ +/// @} + +/// \defgroup compute-concrete-options Concrete option classes for compute functions +/// @{ +/// @} + +#include "arrow/compute/api_aggregate.h" // IWYU pragma: export +#include "arrow/compute/api_scalar.h" // IWYU pragma: export +#include "arrow/compute/api_vector.h" // IWYU pragma: export +#include "arrow/compute/cast.h" // IWYU pragma: export +#include "arrow/compute/function.h" // IWYU pragma: export +#include "arrow/compute/function_options.h" // IWYU pragma: export +#include "arrow/compute/kernel.h" // IWYU pragma: export +#include "arrow/compute/registry.h" // IWYU pragma: export +#include "arrow/datum.h" // IWYU pragma: export + +#include "arrow/compute/expression.h" // IWYU pragma: export + +/// \defgroup execnode-row Utilities for working with data in a row-major format +/// @{ +/// @} + +#include "arrow/compute/row/grouper.h" // IWYU pragma: export + +/// \defgroup acero-internals Acero internals, useful for those extending Acero +/// @{ +/// @} + +#include "arrow/compute/exec.h" // IWYU pragma: export diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_aggregate.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_aggregate.h new file mode 100644 index 0000000000000000000000000000000000000000..2e5210b073ee4218145646bc512e06a9a0d3df6a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_aggregate.h @@ -0,0 +1,466 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Eager evaluation convenience APIs for invoking common functions, including +// necessary memory allocations + +#pragma once + +#include + +#include "arrow/compute/function_options.h" +#include "arrow/datum.h" +#include "arrow/result.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; + +namespace compute { + +class ExecContext; + +// ---------------------------------------------------------------------- +// Aggregate functions + +/// \addtogroup compute-concrete-options +/// @{ + +/// \brief Control general scalar aggregate kernel behavior +/// +/// By default, null values are ignored (skip_nulls = true). +class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { + public: + explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1); + static constexpr char const kTypeName[] = "ScalarAggregateOptions"; + static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; } + + /// If true (the default), null values are ignored. Otherwise, if any value is null, + /// emit null. + bool skip_nulls; + /// If less than this many non-null values are observed, emit null. + uint32_t min_count; +}; + +/// \brief Control count aggregate kernel behavior. +/// +/// By default, only non-null values are counted. +class ARROW_EXPORT CountOptions : public FunctionOptions { + public: + enum CountMode { + /// Count only non-null values. + ONLY_VALID = 0, + /// Count only null values. + ONLY_NULL, + /// Count both non-null and null values. + ALL, + }; + explicit CountOptions(CountMode mode = CountMode::ONLY_VALID); + static constexpr char const kTypeName[] = "CountOptions"; + static CountOptions Defaults() { return CountOptions{}; } + + CountMode mode; +}; + +/// \brief Control Mode kernel behavior +/// +/// Returns top-n common values and counts. +/// By default, returns the most common value and count. +class ARROW_EXPORT ModeOptions : public FunctionOptions { + public: + explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0); + static constexpr char const kTypeName[] = "ModeOptions"; + static ModeOptions Defaults() { return ModeOptions{}; } + + int64_t n = 1; + /// If true (the default), null values are ignored. Otherwise, if any value is null, + /// emit null. + bool skip_nulls; + /// If less than this many non-null values are observed, emit null. + uint32_t min_count; +}; + +/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel +/// +/// The divisor used in calculations is N - ddof, where N is the number of elements. +/// By default, ddof is zero, and population variance or stddev is returned. +class ARROW_EXPORT VarianceOptions : public FunctionOptions { + public: + explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0); + static constexpr char const kTypeName[] = "VarianceOptions"; + static VarianceOptions Defaults() { return VarianceOptions{}; } + + int ddof = 0; + /// If true (the default), null values are ignored. Otherwise, if any value is null, + /// emit null. + bool skip_nulls; + /// If less than this many non-null values are observed, emit null. + uint32_t min_count; +}; + +/// \brief Control Quantile kernel behavior +/// +/// By default, returns the median value. 
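// --- Editor's illustrative sketch (not part of the vendored header) ---
// Calling one of the eager aggregate helpers declared in this header with
// explicit null handling. `strict_mean` is a hypothetical wrapper; Mean() is
// declared further down in this file.
#include "arrow/compute/api_aggregate.h"

arrow::Result<arrow::Datum> strict_mean(const arrow::Datum& values) {
  // Propagate null if any input value is null, and require at least one
  // non-null value.
  arrow::compute::ScalarAggregateOptions options(/*skip_nulls=*/false,
                                                 /*min_count=*/1);
  return arrow::compute::Mean(values, options);
}
// --- end of editor's sketch ---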
+class ARROW_EXPORT QuantileOptions : public FunctionOptions {
+ public:
+  /// Interpolation method to use when quantile lies between two data points
+  enum Interpolation {
+    LINEAR = 0,
+    LOWER,
+    HIGHER,
+    NEAREST,
+    MIDPOINT,
+  };
+
+  explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
+                           bool skip_nulls = true, uint32_t min_count = 0);
+
+  explicit QuantileOptions(std::vector<double> q,
+                           enum Interpolation interpolation = LINEAR,
+                           bool skip_nulls = true, uint32_t min_count = 0);
+
+  static constexpr char const kTypeName[] = "QuantileOptions";
+  static QuantileOptions Defaults() { return QuantileOptions{}; }
+
+  /// probability level of quantile must be between 0 and 1 inclusive
+  std::vector<double> q;
+  enum Interpolation interpolation;
+  /// If true (the default), null values are ignored. Otherwise, if any value is null,
+  /// emit null.
+  bool skip_nulls;
+  /// If less than this many non-null values are observed, emit null.
+  uint32_t min_count;
+};
+
+/// \brief Control TDigest approximate quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT TDigestOptions : public FunctionOptions {
+ public:
+  explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
+                          uint32_t buffer_size = 500, bool skip_nulls = true,
+                          uint32_t min_count = 0);
+  explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
+                          uint32_t buffer_size = 500, bool skip_nulls = true,
+                          uint32_t min_count = 0);
+  static constexpr char const kTypeName[] = "TDigestOptions";
+  static TDigestOptions Defaults() { return TDigestOptions{}; }
+
+  /// probability level of quantile must be between 0 and 1 inclusive
+  std::vector<double> q;
+  /// compression parameter, default 100
+  uint32_t delta;
+  /// input buffer size, default 500
+  uint32_t buffer_size;
+  /// If true (the default), null values are ignored. Otherwise, if any value is null,
+  /// emit null.
+  bool skip_nulls;
+  /// If less than this many non-null values are observed, emit null.
+ uint32_t min_count; +}; + +/// \brief Control Index kernel behavior +class ARROW_EXPORT IndexOptions : public FunctionOptions { + public: + explicit IndexOptions(std::shared_ptr value); + // Default constructor for serialization + IndexOptions(); + static constexpr char const kTypeName[] = "IndexOptions"; + + std::shared_ptr value; +}; + +/// \brief Configure a grouped aggregation +struct ARROW_EXPORT Aggregate { + Aggregate() = default; + + Aggregate(std::string function, std::shared_ptr options, + std::vector target, std::string name = "") + : function(std::move(function)), + options(std::move(options)), + target(std::move(target)), + name(std::move(name)) {} + + Aggregate(std::string function, std::shared_ptr options, + FieldRef target, std::string name = "") + : Aggregate(std::move(function), std::move(options), + std::vector{std::move(target)}, std::move(name)) {} + + Aggregate(std::string function, FieldRef target, std::string name) + : Aggregate(std::move(function), /*options=*/NULLPTR, + std::vector{std::move(target)}, std::move(name)) {} + + Aggregate(std::string function, std::string name) + : Aggregate(std::move(function), /*options=*/NULLPTR, + /*target=*/std::vector{}, std::move(name)) {} + + /// the name of the aggregation function + std::string function; + + /// options for the aggregation function + std::shared_ptr options; + + /// zero or more fields to which aggregations will be applied + std::vector target; + + /// optional output field name for aggregations + std::string name; +}; + +/// @} + +/// \brief Count values in an array. +/// +/// \param[in] options counting options, see CountOptions for more information +/// \param[in] datum to count +/// \param[in] ctx the function execution context, optional +/// \return out resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Count(const Datum& datum, + const CountOptions& options = CountOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the mean of a numeric array. +/// +/// \param[in] value datum to compute the mean, expecting Array +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed mean as a DoubleScalar +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Mean( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the product of values of a numeric array. +/// +/// \param[in] value datum to compute product of, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed sum as a Scalar +/// +/// \since 6.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Product( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Sum values of a numeric array. 
+/// +/// \param[in] value datum to sum, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed sum as a Scalar +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Sum( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the first value of an array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed first as Scalar +/// +/// \since 13.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result First( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the last value of an array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed last as a Scalar +/// +/// \since 13.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Last( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the min / max of a numeric array +/// +/// This function returns both the min and max as a struct scalar, with type +/// struct, where T is the input type +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return resulting datum as a struct scalar +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result MinMax( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Test whether any element in a boolean array evaluates to true. +/// +/// This function returns true if any of the elements in the array evaluates +/// to true and false otherwise. Null values are ignored by default. +/// If null values are taken into account by setting ScalarAggregateOptions +/// parameter skip_nulls = false then Kleene logic is used. +/// See KleeneOr for more details on Kleene logic. +/// +/// \param[in] value input datum, expecting a boolean array +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return resulting datum as a BooleanScalar +/// +/// \since 3.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Any( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Test whether all elements in a boolean array evaluate to true. +/// +/// This function returns true if all of the elements in the array evaluate +/// to true and false otherwise. Null values are ignored by default. +/// If null values are taken into account by setting ScalarAggregateOptions +/// parameter skip_nulls = false then Kleene logic is used. +/// See KleeneAnd for more details on Kleene logic. 
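// --- Editor's illustrative sketch (not part of the vendored header) ---
// Kleene-logic "all" over a boolean array: with skip_nulls=false a null
// element can make the result null rather than being ignored. All() is
// declared just below; `kleene_all` is a hypothetical wrapper.
#include "arrow/compute/api_aggregate.h"

arrow::Result<arrow::Datum> kleene_all(const arrow::Datum& booleans) {
  arrow::compute::ScalarAggregateOptions options(/*skip_nulls=*/false);
  return arrow::compute::All(booleans, options);
}
// --- end of editor's sketch ---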
+/// +/// \param[in] value input datum, expecting a boolean array +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return resulting datum as a BooleanScalar + +/// \since 3.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result All( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the modal (most common) value of a numeric array +/// +/// This function returns top-n most common values and number of times they occur as +/// an array of `struct`, where T is the input type. +/// Values with larger counts are returned before smaller ones. +/// If there are more than one values with same count, smaller value is returned first. +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see ModeOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return resulting datum as an array of struct +/// +/// \since 2.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Mode(const Datum& value, + const ModeOptions& options = ModeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the standard deviation of a numeric array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see VarianceOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed standard deviation as a DoubleScalar +/// +/// \since 2.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Stddev(const Datum& value, + const VarianceOptions& options = VarianceOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the variance of a numeric array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see VarianceOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed variance as a DoubleScalar +/// +/// \since 2.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Variance(const Datum& value, + const VarianceOptions& options = VarianceOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the quantiles of a numeric array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see QuantileOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return resulting datum as an array +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Quantile(const Datum& value, + const QuantileOptions& options = QuantileOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see TDigestOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return resulting datum as an array +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result TDigest(const Datum& value, + const TDigestOptions& options = TDigestOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Find the first index of a value in an array. +/// +/// \param[in] value The array to search. +/// \param[in] options The array to search for. See IndexOptions. 
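// --- Editor's illustrative sketch (not part of the vendored header) ---
// Finding the first occurrence of a value with Index(); the needle is passed
// as a Scalar through IndexOptions. `find_first_42` is a hypothetical helper.
#include <memory>
#include "arrow/compute/api_aggregate.h"
#include "arrow/scalar.h"

arrow::Result<arrow::Datum> find_first_42(const arrow::Datum& values) {
  arrow::compute::IndexOptions options(std::make_shared<arrow::Int64Scalar>(42));
  return arrow::compute::Index(values, options);
}
// --- end of editor's sketch ---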
+/// \param[in] ctx the function execution context, optional +/// \return out a Scalar containing the index (or -1 if not found). +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Index(const Datum& value, const IndexOptions& options, + ExecContext* ctx = NULLPTR); + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_scalar.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_scalar.h new file mode 100644 index 0000000000000000000000000000000000000000..947474e5962d0198d78335fc8d22189055dffe00 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_scalar.h @@ -0,0 +1,1722 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Eager evaluation convenience APIs for invoking common functions, including +// necessary memory allocations + +#pragma once + +#include +#include +#include + +#include "arrow/compute/function_options.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/datum.h" +#include "arrow/result.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +/// \addtogroup compute-concrete-options +/// +/// @{ + +class ARROW_EXPORT ArithmeticOptions : public FunctionOptions { + public: + explicit ArithmeticOptions(bool check_overflow = false); + static constexpr char const kTypeName[] = "ArithmeticOptions"; + bool check_overflow; +}; + +class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions { + public: + explicit ElementWiseAggregateOptions(bool skip_nulls = true); + static constexpr char const kTypeName[] = "ElementWiseAggregateOptions"; + static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; } + bool skip_nulls; +}; + +/// Rounding and tie-breaking modes for round compute functions. +/// Additional details and examples are provided in compute.rst. 
+enum class RoundMode : int8_t { + /// Round to nearest integer less than or equal in magnitude (aka "floor") + DOWN, + /// Round to nearest integer greater than or equal in magnitude (aka "ceil") + UP, + /// Get the integral part without fractional digits (aka "trunc") + TOWARDS_ZERO, + /// Round negative values with DOWN rule + /// and positive values with UP rule (aka "away from zero") + TOWARDS_INFINITY, + /// Round ties with DOWN rule (also called "round half towards negative infinity") + HALF_DOWN, + /// Round ties with UP rule (also called "round half towards positive infinity") + HALF_UP, + /// Round ties with TOWARDS_ZERO rule (also called "round half away from infinity") + HALF_TOWARDS_ZERO, + /// Round ties with TOWARDS_INFINITY rule (also called "round half away from zero") + HALF_TOWARDS_INFINITY, + /// Round ties to nearest even integer + HALF_TO_EVEN, + /// Round ties to nearest odd integer + HALF_TO_ODD, +}; + +class ARROW_EXPORT RoundOptions : public FunctionOptions { + public: + explicit RoundOptions(int64_t ndigits = 0, + RoundMode round_mode = RoundMode::HALF_TO_EVEN); + static constexpr char const kTypeName[] = "RoundOptions"; + static RoundOptions Defaults() { return RoundOptions(); } + /// Rounding precision (number of digits to round to) + int64_t ndigits; + /// Rounding and tie-breaking mode + RoundMode round_mode; +}; + +class ARROW_EXPORT RoundBinaryOptions : public FunctionOptions { + public: + explicit RoundBinaryOptions(RoundMode round_mode = RoundMode::HALF_TO_EVEN); + static constexpr char const kTypeName[] = "RoundBinaryOptions"; + static RoundBinaryOptions Defaults() { return RoundBinaryOptions(); } + /// Rounding and tie-breaking mode + RoundMode round_mode; +}; + +enum class CalendarUnit : int8_t { + NANOSECOND, + MICROSECOND, + MILLISECOND, + SECOND, + MINUTE, + HOUR, + DAY, + WEEK, + MONTH, + QUARTER, + YEAR +}; + +class ARROW_EXPORT RoundTemporalOptions : public FunctionOptions { + public: + explicit RoundTemporalOptions(int multiple = 1, CalendarUnit unit = CalendarUnit::DAY, + bool week_starts_monday = true, + bool ceil_is_strictly_greater = false, + bool calendar_based_origin = false); + static constexpr char const kTypeName[] = "RoundTemporalOptions"; + static RoundTemporalOptions Defaults() { return RoundTemporalOptions(); } + + /// Number of units to round to + int multiple; + /// The unit used for rounding of time + CalendarUnit unit; + /// What day does the week start with (Monday=true, Sunday=false) + bool week_starts_monday; + /// Enable this flag to return a rounded value that is strictly greater than the input. + /// For example: ceiling 1970-01-01T00:00:00 to 3 hours would yield 1970-01-01T03:00:00 + /// if set to true and 1970-01-01T00:00:00 if set to false. + /// This applies for ceiling only. + bool ceil_is_strictly_greater; + /// By default time is rounded to a multiple of units since 1970-01-01T00:00:00. + /// By setting calendar_based_origin to true, time will be rounded to a number + /// of units since the last greater calendar unit. + /// For example: rounding to a multiple of days since the beginning of the month or + /// to hours since the beginning of the day. + /// Exceptions: week and quarter are not used as greater units, therefore days will + /// will be rounded to the beginning of the month not week. Greater unit of week + /// is year. + /// Note that ceiling and rounding might change sorting order of an array near greater + /// unit change. 
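// --- Editor's illustrative sketch (not part of the vendored header) ---
// Using RoundOptions through the generic CallFunction() entry point (declared
// in arrow/compute/exec.h, pulled in by arrow/compute/api.h); "round" is the
// registered kernel name. `round_two_digits` is a hypothetical wrapper.
#include "arrow/compute/api.h"

arrow::Result<arrow::Datum> round_two_digits(const arrow::Datum& values) {
  arrow::compute::RoundOptions options(/*ndigits=*/2,
                                       arrow::compute::RoundMode::HALF_TO_EVEN);
  return arrow::compute::CallFunction("round", {values}, &options);
}
// --- end of editor's sketch ---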
For example rounding YYYY-mm-dd 23:00:00 to 5 hours will ceil and + /// round to YYYY-mm-dd+1 01:00:00 and floor to YYYY-mm-dd 20:00:00. On the other hand + /// YYYY-mm-dd+1 00:00:00 will ceil, round and floor to YYYY-mm-dd+1 00:00:00. This + /// can break the order of an already ordered array. + bool calendar_based_origin; +}; + +class ARROW_EXPORT RoundToMultipleOptions : public FunctionOptions { + public: + explicit RoundToMultipleOptions(double multiple = 1.0, + RoundMode round_mode = RoundMode::HALF_TO_EVEN); + explicit RoundToMultipleOptions(std::shared_ptr multiple, + RoundMode round_mode = RoundMode::HALF_TO_EVEN); + static constexpr char const kTypeName[] = "RoundToMultipleOptions"; + static RoundToMultipleOptions Defaults() { return RoundToMultipleOptions(); } + /// Rounding scale (multiple to round to). + /// + /// Should be a positive numeric scalar of a type compatible with the + /// argument to be rounded. The cast kernel is used to convert the rounding + /// multiple to match the result type. + std::shared_ptr multiple; + /// Rounding and tie-breaking mode + RoundMode round_mode; +}; + +/// Options for var_args_join. +class ARROW_EXPORT JoinOptions : public FunctionOptions { + public: + /// How to handle null values. (A null separator always results in a null output.) + enum NullHandlingBehavior { + /// A null in any input results in a null in the output. + EMIT_NULL, + /// Nulls in inputs are skipped. + SKIP, + /// Nulls in inputs are replaced with the replacement string. + REPLACE, + }; + explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL, + std::string null_replacement = ""); + static constexpr char const kTypeName[] = "JoinOptions"; + static JoinOptions Defaults() { return JoinOptions(); } + NullHandlingBehavior null_handling; + std::string null_replacement; +}; + +class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { + public: + explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false); + MatchSubstringOptions(); + static constexpr char const kTypeName[] = "MatchSubstringOptions"; + + /// The exact substring (or regex, depending on kernel) to look for inside input values. + std::string pattern; + /// Whether to perform a case-insensitive match. + bool ignore_case; +}; + +class ARROW_EXPORT SplitOptions : public FunctionOptions { + public: + explicit SplitOptions(int64_t max_splits = -1, bool reverse = false); + static constexpr char const kTypeName[] = "SplitOptions"; + + /// Maximum number of splits allowed, or unlimited when -1 + int64_t max_splits; + /// Start splitting from the end of the string (only relevant when max_splits != -1) + bool reverse; +}; + +class ARROW_EXPORT SplitPatternOptions : public FunctionOptions { + public: + explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1, + bool reverse = false); + SplitPatternOptions(); + static constexpr char const kTypeName[] = "SplitPatternOptions"; + + /// The exact substring to split on. 
+ std::string pattern; + /// Maximum number of splits allowed, or unlimited when -1 + int64_t max_splits; + /// Start splitting from the end of the string (only relevant when max_splits != -1) + bool reverse; +}; + +class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions { + public: + explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement); + ReplaceSliceOptions(); + static constexpr char const kTypeName[] = "ReplaceSliceOptions"; + + /// Index to start slicing at + int64_t start; + /// Index to stop slicing at + int64_t stop; + /// String to replace the slice with + std::string replacement; +}; + +class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions { + public: + explicit ReplaceSubstringOptions(std::string pattern, std::string replacement, + int64_t max_replacements = -1); + ReplaceSubstringOptions(); + static constexpr char const kTypeName[] = "ReplaceSubstringOptions"; + + /// Pattern to match, literal, or regular expression depending on which kernel is used + std::string pattern; + /// String to replace the pattern with + std::string replacement; + /// Max number of substrings to replace (-1 means unbounded) + int64_t max_replacements; +}; + +class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions { + public: + explicit ExtractRegexOptions(std::string pattern); + ExtractRegexOptions(); + static constexpr char const kTypeName[] = "ExtractRegexOptions"; + + /// Regular expression with named capture fields + std::string pattern; +}; + +/// Options for IsIn and IndexIn functions +class ARROW_EXPORT SetLookupOptions : public FunctionOptions { + public: + /// How to handle null values. + enum NullMatchingBehavior { + /// MATCH, any null in `value_set` is successfully matched in + /// the input. + MATCH, + /// SKIP, any null in `value_set` is ignored and nulls in the input + /// produce null (IndexIn) or false (IsIn) values in the output. + SKIP, + /// EMIT_NULL, any null in `value_set` is ignored and nulls in the + /// input produce null (IndexIn and IsIn) values in the output. + EMIT_NULL, + /// INCONCLUSIVE, null values are regarded as unknown values, which is + /// sql-compatible. nulls in the input produce null (IndexIn and IsIn) + /// values in the output. Besides, if `value_set` contains a null, + /// non-null unmatched values in the input also produce null values + /// (IndexIn and IsIn) in the output. + INCONCLUSIVE + }; + + explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); + SetLookupOptions(); + + // DEPRECATED(will be removed after removing of skip_nulls) + explicit SetLookupOptions(Datum value_set, bool skip_nulls); + + static constexpr char const kTypeName[] = "SetLookupOptions"; + + /// The set of values to look up input values into. + Datum value_set; + + NullMatchingBehavior null_matching_behavior; + + // DEPRECATED(will be removed after removing of skip_nulls) + NullMatchingBehavior GetNullMatchingBehavior() const; + + // DEPRECATED(use null_matching_behavior instead) + /// Whether nulls in `value_set` count for lookup. + /// + /// If true, any null in `value_set` is ignored and nulls in the input + /// produce null (IndexIn) or false (IsIn) values in the output. + /// If false, any null in `value_set` is successfully matched in + /// the input. 
+ std::optional skip_nulls; +}; + +/// Options for struct_field function +class ARROW_EXPORT StructFieldOptions : public FunctionOptions { + public: + explicit StructFieldOptions(std::vector indices); + explicit StructFieldOptions(std::initializer_list); + explicit StructFieldOptions(FieldRef field_ref); + StructFieldOptions(); + static constexpr char const kTypeName[] = "StructFieldOptions"; + + /// The FieldRef specifying what to extract from struct or union. + FieldRef field_ref; +}; + +class ARROW_EXPORT StrptimeOptions : public FunctionOptions { + public: + explicit StrptimeOptions(std::string format, TimeUnit::type unit, + bool error_is_null = false); + StrptimeOptions(); + static constexpr char const kTypeName[] = "StrptimeOptions"; + + /// The desired format string. + std::string format; + /// The desired time resolution + TimeUnit::type unit; + /// Return null on parsing errors if true or raise if false + bool error_is_null; +}; + +class ARROW_EXPORT StrftimeOptions : public FunctionOptions { + public: + explicit StrftimeOptions(std::string format, std::string locale = "C"); + StrftimeOptions(); + + static constexpr char const kTypeName[] = "StrftimeOptions"; + + static constexpr const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%S"; + + /// The desired format string. + std::string format; + /// The desired output locale string. + std::string locale; +}; + +class ARROW_EXPORT PadOptions : public FunctionOptions { + public: + explicit PadOptions(int64_t width, std::string padding = " ", + bool lean_left_on_odd_padding = true); + PadOptions(); + static constexpr char const kTypeName[] = "PadOptions"; + + /// The desired string length. + int64_t width; + /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII). + std::string padding; + /// What to do if there is an odd number of padding characters (in case of centered + /// padding). Defaults to aligning on the left (i.e. adding the extra padding character + /// on the right) + bool lean_left_on_odd_padding = true; +}; + +class ARROW_EXPORT TrimOptions : public FunctionOptions { + public: + explicit TrimOptions(std::string characters); + TrimOptions(); + static constexpr char const kTypeName[] = "TrimOptions"; + + /// The individual characters to be trimmed from the string. + std::string characters; +}; + +class ARROW_EXPORT SliceOptions : public FunctionOptions { + public: + explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits::max(), + int64_t step = 1); + SliceOptions(); + static constexpr char const kTypeName[] = "SliceOptions"; + int64_t start, stop, step; +}; + +class ARROW_EXPORT ListSliceOptions : public FunctionOptions { + public: + explicit ListSliceOptions(int64_t start, std::optional stop = std::nullopt, + int64_t step = 1, + std::optional return_fixed_size_list = std::nullopt); + ListSliceOptions(); + static constexpr char const kTypeName[] = "ListSliceOptions"; + /// The start of list slicing. + int64_t start; + /// Optional stop of list slicing. If not set, then slice to end. (NotImplemented) + std::optional stop; + /// Slicing step + int64_t step; + // Whether to return a FixedSizeListArray. If true _and_ stop is after + // a list element's length, nulls will be appended to create the requested slice size. + // Default of `nullopt` will return whatever type it got in. 
+ std::optional return_fixed_size_list; +}; + +class ARROW_EXPORT NullOptions : public FunctionOptions { + public: + explicit NullOptions(bool nan_is_null = false); + static constexpr char const kTypeName[] = "NullOptions"; + static NullOptions Defaults() { return NullOptions{}; } + + bool nan_is_null; +}; + +enum CompareOperator : int8_t { + EQUAL, + NOT_EQUAL, + GREATER, + GREATER_EQUAL, + LESS, + LESS_EQUAL, +}; + +struct ARROW_EXPORT CompareOptions { + explicit CompareOptions(CompareOperator op) : op(op) {} + CompareOptions() : CompareOptions(CompareOperator::EQUAL) {} + enum CompareOperator op; +}; + +class ARROW_EXPORT MakeStructOptions : public FunctionOptions { + public: + MakeStructOptions(std::vector n, std::vector r, + std::vector> m); + explicit MakeStructOptions(std::vector n); + MakeStructOptions(); + static constexpr char const kTypeName[] = "MakeStructOptions"; + + /// Names for wrapped columns + std::vector field_names; + + /// Nullability bits for wrapped columns + std::vector field_nullability; + + /// Metadata attached to wrapped columns + std::vector> field_metadata; +}; + +struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions { + public: + explicit DayOfWeekOptions(bool count_from_zero = true, uint32_t week_start = 1); + static constexpr char const kTypeName[] = "DayOfWeekOptions"; + static DayOfWeekOptions Defaults() { return DayOfWeekOptions(); } + + /// Number days from 0 if true and from 1 if false + bool count_from_zero; + /// What day does the week start with (Monday=1, Sunday=7). + /// The numbering is unaffected by the count_from_zero parameter. + uint32_t week_start; +}; + +/// Used to control timestamp timezone conversion and handling ambiguous/nonexistent +/// times. +struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions { + public: + /// \brief How to interpret ambiguous local times that can be interpreted as + /// multiple instants (normally two) due to DST shifts. + /// + /// AMBIGUOUS_EARLIEST emits the earliest instant amongst possible interpretations. + /// AMBIGUOUS_LATEST emits the latest instant amongst possible interpretations. + enum Ambiguous { AMBIGUOUS_RAISE, AMBIGUOUS_EARLIEST, AMBIGUOUS_LATEST }; + + /// \brief How to handle local times that do not exist due to DST shifts. + /// + /// NONEXISTENT_EARLIEST emits the instant "just before" the DST shift instant + /// in the given timestamp precision (for example, for a nanoseconds precision + /// timestamp, this is one nanosecond before the DST shift instant). + /// NONEXISTENT_LATEST emits the DST shift instant. 
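+  ///
+  /// For illustration only, a minimal sketch (namespaces omitted; the timezone
+  /// name is a hypothetical example):
+  /// \code
+  /// AssumeTimezoneOptions opts("Europe/Brussels",
+  ///                            AssumeTimezoneOptions::AMBIGUOUS_EARLIEST,
+  ///                            AssumeTimezoneOptions::NONEXISTENT_LATEST);
+  /// \endcode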
+ enum Nonexistent { NONEXISTENT_RAISE, NONEXISTENT_EARLIEST, NONEXISTENT_LATEST }; + + explicit AssumeTimezoneOptions(std::string timezone, + Ambiguous ambiguous = AMBIGUOUS_RAISE, + Nonexistent nonexistent = NONEXISTENT_RAISE); + AssumeTimezoneOptions(); + static constexpr char const kTypeName[] = "AssumeTimezoneOptions"; + + /// Timezone to convert timestamps from + std::string timezone; + + /// How to interpret ambiguous local times (due to DST shifts) + Ambiguous ambiguous; + /// How to interpret nonexistent local times (due to DST shifts) + Nonexistent nonexistent; +}; + +struct ARROW_EXPORT WeekOptions : public FunctionOptions { + public: + explicit WeekOptions(bool week_starts_monday = true, bool count_from_zero = false, + bool first_week_is_fully_in_year = false); + static constexpr char const kTypeName[] = "WeekOptions"; + static WeekOptions Defaults() { return WeekOptions{}; } + static WeekOptions ISODefaults() { + return WeekOptions{/*week_starts_monday*/ true, + /*count_from_zero=*/false, + /*first_week_is_fully_in_year=*/false}; + } + static WeekOptions USDefaults() { + return WeekOptions{/*week_starts_monday*/ false, + /*count_from_zero=*/false, + /*first_week_is_fully_in_year=*/false}; + } + + /// What day does the week start with (Monday=true, Sunday=false) + bool week_starts_monday; + /// Dates from current year that fall into last ISO week of the previous year return + /// 0 if true and 52 or 53 if false. + bool count_from_zero; + /// Must the first week be fully in January (true), or is a week that begins on + /// December 29, 30, or 31 considered to be the first week of the new year (false)? + bool first_week_is_fully_in_year; +}; + +struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions { + public: + enum Form { NFC, NFKC, NFD, NFKD }; + + explicit Utf8NormalizeOptions(Form form = NFC); + static Utf8NormalizeOptions Defaults() { return Utf8NormalizeOptions(); } + static constexpr char const kTypeName[] = "Utf8NormalizeOptions"; + + /// The Unicode normalization form to apply + Form form; +}; + +class ARROW_EXPORT RandomOptions : public FunctionOptions { + public: + enum Initializer { SystemRandom, Seed }; + + static RandomOptions FromSystemRandom() { return RandomOptions{SystemRandom, 0}; } + static RandomOptions FromSeed(uint64_t seed) { return RandomOptions{Seed, seed}; } + + RandomOptions(Initializer initializer, uint64_t seed); + RandomOptions(); + static constexpr char const kTypeName[] = "RandomOptions"; + static RandomOptions Defaults() { return RandomOptions(); } + + /// The type of initialization for random number generation - system or provided seed. + Initializer initializer; + /// The seed value used to initialize the random number generation. + uint64_t seed; +}; + +/// Options for map_lookup function +class ARROW_EXPORT MapLookupOptions : public FunctionOptions { + public: + enum Occurrence { + /// Return the first matching value + FIRST, + /// Return the last matching value + LAST, + /// Return all matching values + ALL + }; + + explicit MapLookupOptions(std::shared_ptr query_key, Occurrence occurrence); + MapLookupOptions(); + + constexpr static char const kTypeName[] = "MapLookupOptions"; + + /// The key to lookup in the map + std::shared_ptr query_key; + + /// Whether to return the first, last, or all matching values + Occurrence occurrence; +}; + +/// @} + +/// \brief Get the absolute value of a value. +/// +/// If argument is null the result will be null. 
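+///
+/// For illustration only, a minimal sketch (namespaces omitted; `input` is an
+/// assumed Datum and the surrounding function returns Status, as
+/// ARROW_ASSIGN_OR_RAISE expects):
+/// \code
+/// ARROW_ASSIGN_OR_RAISE(Datum magnitudes, AbsoluteValue(input));
+/// \endcode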
+///
+/// \param[in] arg the value transformed
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise absolute value
+ARROW_EXPORT
+Result<Datum> AbsoluteValue(const Datum& arg,
+                            ArithmeticOptions options = ArithmeticOptions(),
+                            ExecContext* ctx = NULLPTR);
+
+/// \brief Add two values together. Array values must be the same length. If
+/// either addend is null the result will be null.
+///
+/// \param[in] left the first addend
+/// \param[in] right the second addend
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sum
+ARROW_EXPORT
+Result<Datum> Add(const Datum& left, const Datum& right,
+                  ArithmeticOptions options = ArithmeticOptions(),
+                  ExecContext* ctx = NULLPTR);
+
+/// \brief Subtract two values. Array values must be the same length. If the
+/// minuend or subtrahend is null the result will be null.
+///
+/// \param[in] left the value subtracted from (minuend)
+/// \param[in] right the value by which the minuend is reduced (subtrahend)
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise difference
+ARROW_EXPORT
+Result<Datum> Subtract(const Datum& left, const Datum& right,
+                       ArithmeticOptions options = ArithmeticOptions(),
+                       ExecContext* ctx = NULLPTR);
+
+/// \brief Multiply two values. Array values must be the same length. If either
+/// factor is null the result will be null.
+///
+/// \param[in] left the first factor
+/// \param[in] right the second factor
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise product
+ARROW_EXPORT
+Result<Datum> Multiply(const Datum& left, const Datum& right,
+                       ArithmeticOptions options = ArithmeticOptions(),
+                       ExecContext* ctx = NULLPTR);
+
+/// \brief Divide two values. Array values must be the same length. If either
+/// argument is null the result will be null. For integer types, if there is
+/// a zero divisor, an error will be raised.
+///
+/// \param[in] left the dividend
+/// \param[in] right the divisor
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise quotient
+ARROW_EXPORT
+Result<Datum> Divide(const Datum& left, const Datum& right,
+                     ArithmeticOptions options = ArithmeticOptions(),
+                     ExecContext* ctx = NULLPTR);
+
+/// \brief Negate values.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value negated
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise negation
+ARROW_EXPORT
+Result<Datum> Negate(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                     ExecContext* ctx = NULLPTR);
+
+/// \brief Raise the values of base array to the power of the exponent array values.
+/// Array values must be the same length. If either base or exponent is null the result
+/// will be null.
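+///
+/// For illustration only, a minimal sketch (namespaces omitted; `base` and
+/// `exponent` are assumed Datums; check_overflow refers to the flag assumed to
+/// be exposed by ArithmeticOptions earlier in this header):
+/// \code
+/// ARROW_ASSIGN_OR_RAISE(
+///     Datum powered,
+///     Power(base, exponent, ArithmeticOptions(/*check_overflow=*/true)));
+/// \endcode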
+/// +/// \param[in] left the base +/// \param[in] right the exponent +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise base value raised to the power of exponent +ARROW_EXPORT +Result Power(const Datum& left, const Datum& right, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Raise Euler's number to the power of specified exponent, element-wise. +/// If the exponent value is null the result will be null. +/// +/// \param[in] arg the exponent +/// \param[in] ctx the function execution context, optional +/// \return the element-wise Euler's number raised to the power of exponent +ARROW_EXPORT +Result Exp(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Left shift the left array by the right array. Array values must be the +/// same length. If either operand is null, the result will be null. +/// +/// \param[in] left the value to shift +/// \param[in] right the value to shift by +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise left value shifted left by the right value +ARROW_EXPORT +Result ShiftLeft(const Datum& left, const Datum& right, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Right shift the left array by the right array. Array values must be the +/// same length. If either operand is null, the result will be null. Performs a +/// logical shift for unsigned values, and an arithmetic shift for signed values. +/// +/// \param[in] left the value to shift +/// \param[in] right the value to shift by +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise left value shifted right by the right value +ARROW_EXPORT +Result ShiftRight(const Datum& left, const Datum& right, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the sine of the array values. +/// \param[in] arg The values to compute the sine for. +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise sine of the values +ARROW_EXPORT +Result Sin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the cosine of the array values. +/// \param[in] arg The values to compute the cosine for. +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise cosine of the values +ARROW_EXPORT +Result Cos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the inverse sine (arcsine) of the array values. +/// \param[in] arg The values to compute the inverse sine for. 
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise inverse sine of the values +ARROW_EXPORT +Result Asin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the inverse cosine (arccosine) of the array values. +/// \param[in] arg The values to compute the inverse cosine for. +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise inverse cosine of the values +ARROW_EXPORT +Result Acos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the tangent of the array values. +/// \param[in] arg The values to compute the tangent for. +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise tangent of the values +ARROW_EXPORT +Result Tan(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the inverse tangent (arctangent) of the array values. +/// \param[in] arg The values to compute the inverse tangent for. +/// \param[in] ctx the function execution context, optional +/// \return the elementwise inverse tangent of the values +ARROW_EXPORT +Result Atan(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Compute the inverse tangent (arctangent) of y/x, using the +/// argument signs to determine the correct quadrant. +/// \param[in] y The y-values to compute the inverse tangent for. +/// \param[in] x The x-values to compute the inverse tangent for. +/// \param[in] ctx the function execution context, optional +/// \return the elementwise inverse tangent of the values +ARROW_EXPORT +Result Atan2(const Datum& y, const Datum& x, ExecContext* ctx = NULLPTR); + +/// \brief Get the natural log of a value. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg The values to compute the logarithm for. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise natural log +ARROW_EXPORT +Result Ln(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the log base 10 of a value. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg The values to compute the logarithm for. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise log base 10 +ARROW_EXPORT +Result Log10(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the log base 2 of a value. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg The values to compute the logarithm for. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise log base 2 +ARROW_EXPORT +Result Log2(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the natural log of (1 + value). +/// +/// If argument is null the result will be null. 
+/// This function may be more accurate than Log(1 + value) for values close to zero. +/// +/// \param[in] arg The values to compute the logarithm for. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise natural log +ARROW_EXPORT +Result Log1p(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the log of a value to the given base. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg The values to compute the logarithm for. +/// \param[in] base The given base. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise log to the given base +ARROW_EXPORT +Result Logb(const Datum& arg, const Datum& base, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the square-root of a value. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg The values to compute the square-root for. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise square-root +ARROW_EXPORT +Result Sqrt(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Round to the nearest integer less than or equal in magnitude to the +/// argument. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg the value to round +/// \param[in] ctx the function execution context, optional +/// \return the rounded value +ARROW_EXPORT +Result Floor(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Round to the nearest integer greater than or equal in magnitude to the +/// argument. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg the value to round +/// \param[in] ctx the function execution context, optional +/// \return the rounded value +ARROW_EXPORT +Result Ceil(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Get the integral part without fractional digits. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg the value to truncate +/// \param[in] ctx the function execution context, optional +/// \return the truncated value +ARROW_EXPORT +Result Trunc(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Find the element-wise maximum of any number of arrays or scalars. +/// Array values must be the same length. +/// +/// \param[in] args arrays or scalars to operate on. +/// \param[in] options options for handling nulls, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise maximum +ARROW_EXPORT +Result MaxElementWise( + const std::vector& args, + ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Find the element-wise minimum of any number of arrays or scalars. +/// Array values must be the same length. +/// +/// \param[in] args arrays or scalars to operate on. 
+/// \param[in] options options for handling nulls, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise minimum +ARROW_EXPORT +Result MinElementWise( + const std::vector& args, + ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the sign of a value. Array values can be of arbitrary length. If argument +/// is null the result will be null. +/// +/// \param[in] arg the value to extract sign from +/// \param[in] ctx the function execution context, optional +/// \return the element-wise sign function +ARROW_EXPORT +Result Sign(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Round a value to a given precision. +/// +/// If arg is null the result will be null. +/// +/// \param[in] arg the value to be rounded +/// \param[in] options rounding options (rounding mode and number of digits), optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise rounded value +ARROW_EXPORT +Result Round(const Datum& arg, RoundOptions options = RoundOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Round a value to a given precision. +/// +/// If arg1 is null the result will be null. +/// If arg2 is null then the result will be null. If arg2 is negative, then the rounding +/// place will be shifted to the left (thus -1 would correspond to rounding to the nearest +/// ten). If positive, the rounding place will shift to the right (and +1 would +/// correspond to rounding to the nearest tenth). +/// +/// \param[in] arg1 the value to be rounded +/// \param[in] arg2 the number of significant digits to round to +/// \param[in] options rounding options, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise rounded value +ARROW_EXPORT +Result RoundBinary(const Datum& arg1, const Datum& arg2, + RoundBinaryOptions options = RoundBinaryOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Round a value to a given multiple. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg the value to round +/// \param[in] options rounding options (rounding mode and multiple), optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise rounded value +ARROW_EXPORT +Result RoundToMultiple( + const Datum& arg, RoundToMultipleOptions options = RoundToMultipleOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Ceil a temporal value to a given frequency +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg the temporal value to ceil +/// \param[in] options temporal rounding options, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise rounded value +/// +/// \since 7.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result CeilTemporal( + const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Floor a temporal value to a given frequency +/// +/// If argument is null the result will be null. 
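+///
+/// For illustration only, a minimal sketch (namespaces omitted; `timestamps` is
+/// an assumed timestamp Datum), flooring to 15-minute boundaries:
+/// \code
+/// RoundTemporalOptions to_quarter_hour(/*multiple=*/15, CalendarUnit::MINUTE);
+/// ARROW_ASSIGN_OR_RAISE(Datum floored, FloorTemporal(timestamps, to_quarter_hour));
+/// \endcode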
+/// +/// \param[in] arg the temporal value to floor +/// \param[in] options temporal rounding options, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise rounded value +/// +/// \since 7.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result FloorTemporal( + const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Round a temporal value to a given frequency +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg the temporal value to round +/// \param[in] options temporal rounding options, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise rounded value +/// +/// \since 7.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result RoundTemporal( + const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Invert the values of a boolean datum +/// \param[in] value datum to invert +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Invert(const Datum& value, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise AND of two boolean datums which always propagates nulls +/// (null and false is null). +/// +/// \param[in] left left operand +/// \param[in] right right operand +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result And(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise AND of two boolean datums with a Kleene truth table +/// (null and false is false). +/// +/// \param[in] left left operand +/// \param[in] right right operand +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result KleeneAnd(const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief Element-wise OR of two boolean datums which always propagates nulls +/// (null and true is null). +/// +/// \param[in] left left operand +/// \param[in] right right operand +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Or(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise OR of two boolean datums with a Kleene truth table +/// (null or true is true). +/// +/// \param[in] left left operand +/// \param[in] right right operand +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise XOR of two boolean datums +/// \param[in] left left operand +/// \param[in] right right operand +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Xor(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise AND NOT of two boolean datums which always propagates nulls +/// (null and not true is null). 
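+///
+/// For illustration only, a minimal sketch (namespaces omitted; `a` and `b` are
+/// assumed boolean Datums); KleeneAndNot below applies the Kleene truth table
+/// instead:
+/// \code
+/// ARROW_ASSIGN_OR_RAISE(Datum a_and_not_b, AndNot(a, b));
+/// \endcode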
+/// +/// \param[in] left left operand +/// \param[in] right right operand +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 3.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result AndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise AND NOT of two boolean datums with a Kleene truth table +/// (false and not null is false, null and not true is false). +/// +/// \param[in] left left operand +/// \param[in] right right operand +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 3.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result KleeneAndNot(const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief IsIn returns true for each element of `values` that is contained in +/// `value_set` +/// +/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls. +/// +/// \param[in] values array-like input to look up in value_set +/// \param[in] options SetLookupOptions +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsIn(const Datum& values, const SetLookupOptions& options, + ExecContext* ctx = NULLPTR); +ARROW_EXPORT +Result IsIn(const Datum& values, const Datum& value_set, + ExecContext* ctx = NULLPTR); + +/// \brief IndexIn examines each slot in the values against a value_set array. +/// If the value is not found in value_set, null will be output. +/// If found, the index of occurrence within value_set (ignoring duplicates) +/// will be output. +/// +/// For example given values = [99, 42, 3, null] and +/// value_set = [3, 3, 99], the output will be = [2, null, 0, null] +/// +/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls. 
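+///
+/// For illustration only, a minimal sketch of the example above (namespaces
+/// omitted; `values` and `value_set` are assumed Datums; the second
+/// SetLookupOptions argument defaults to MATCH):
+/// \code
+/// SetLookupOptions opts(value_set);
+/// ARROW_ASSIGN_OR_RAISE(Datum positions, IndexIn(values, opts));
+/// \endcode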
+/// +/// \param[in] values array-like input +/// \param[in] options SetLookupOptions +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IndexIn(const Datum& values, const SetLookupOptions& options, + ExecContext* ctx = NULLPTR); +ARROW_EXPORT +Result IndexIn(const Datum& values, const Datum& value_set, + ExecContext* ctx = NULLPTR); + +/// \brief IsValid returns true for each element of `values` that is not null, +/// false otherwise +/// +/// \param[in] values input to examine for validity +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief IsNull returns true for each element of `values` that is null, +/// false otherwise +/// +/// \param[in] values input to examine for nullity +/// \param[in] options NullOptions +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsNull(const Datum& values, NullOptions options = NullOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief IsNan returns true for each element of `values` that is NaN, +/// false otherwise +/// +/// \param[in] values input to look for NaN +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 3.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsNan(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief IfElse returns elements chosen from `left` or `right` +/// depending on `cond`. `null` values in `cond` will be promoted to the result +/// +/// \param[in] cond `Boolean` condition Scalar/ Array +/// \param[in] left Scalar/ Array +/// \param[in] right Scalar/ Array +/// \param[in] ctx the function execution context, optional +/// +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IfElse(const Datum& cond, const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief CaseWhen behaves like a switch/case or if-else if-else statement: for +/// each row, select the first value for which the corresponding condition is +/// true, or (if given) select the 'else' value, else emit null. Note that a +/// null condition is the same as false. +/// +/// \param[in] cond Conditions (Boolean) +/// \param[in] cases Values (any type), along with an optional 'else' value. 
+/// \param[in] ctx the function execution context, optional +/// +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result CaseWhen(const Datum& cond, const std::vector& cases, + ExecContext* ctx = NULLPTR); + +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief IsLeapYear returns if a year is a leap year for each element of `values` +/// +/// \param[in] values input to extract leap year indicator from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsLeapYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. +/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief YearMonthDay returns a struct containing the Year, Month and Day value for +/// each element of `values`. +/// +/// \param[in] values input to extract (year, month, day) struct from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 7.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result YearMonthDay(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. +/// +/// By default week starts on Monday denoted by 0 and ends on Sunday denoted +/// by 6. Start day of the week (Monday=1, Sunday=7) and numbering base (0 or 1) can be +/// set using DayOfWeekOptions +/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] options for setting start of the week and day numbering +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result DayOfWeek(const Datum& values, + DayOfWeekOptions options = DayOfWeekOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. +/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of its days in January. 
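+///
+/// For illustration only (namespaces omitted; `dates` is an assumed temporal Datum):
+/// \code
+/// ARROW_ASSIGN_OR_RAISE(Datum iso_years, ISOYear(dates));
+/// \endcode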
+/// +/// \param[in] values input to extract ISO year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief USYear returns US epidemiological year number for each element of `values`. +/// First week of US epidemiological year has the majority (4 or more) of it's +/// days in January. Last week of US epidemiological year has the year's last +/// Wednesday in it. US epidemiological week starts on Sunday. +/// +/// \param[in] values input to extract US epidemiological year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result USYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOWeek returns ISO week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of its days in January. +/// ISO week starts on Monday. Year can have 52 or 53 weeks. +/// Week numbering can start with 1. +/// +/// \param[in] values input to extract ISO week of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief USWeek returns US week of year number for each element of `values`. +/// First US week has the majority (4 or more) of its days in January. +/// US week starts on Sunday. Year can have 52 or 53 weeks. +/// Week numbering starts with 1. +/// +/// \param[in] values input to extract US week of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 6.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result USWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Week returns week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of its days in January. +/// Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 +/// depending on DayOfWeekOptions.count_from_zero. +/// +/// \param[in] values input to extract week of year from +/// \param[in] options for setting numbering start +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 6.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Week(const Datum& values, WeekOptions options = WeekOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for +/// each element of `values`. +/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7. +/// +/// \param[in] values input to ISO calendar struct from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Quarter returns the quarter of year number for each element of `values` +/// First quarter maps to 1 and fourth quarter maps to 4. 
+/// +/// \param[in] values input to extract quarter of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Quarter(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Hour returns hour value for each element of `values` +/// +/// \param[in] values input to extract hour from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Hour(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Minute returns minutes value for each element of `values` +/// +/// \param[in] values input to extract minutes from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Minute(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Second returns seconds value for each element of `values` +/// +/// \param[in] values input to extract seconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Second(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Millisecond returns number of milliseconds since the last full second +/// for each element of `values` +/// +/// \param[in] values input to extract milliseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Millisecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Microsecond returns number of microseconds since the last full millisecond +/// for each element of `values` +/// +/// \param[in] values input to extract microseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Microsecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Nanosecond returns number of nanoseconds since the last full millisecond +/// for each element of `values` +/// +/// \param[in] values input to extract nanoseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Subsecond returns the fraction of second elapsed since last full second +/// as a float for each element of `values` +/// +/// \param[in] values input to extract subsecond from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Subsecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Format timestamps according to a format string +/// +/// Return formatted time strings according to the format string +/// `StrftimeOptions::format` and to the locale specifier `Strftime::locale`. 
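+///
+/// For illustration only, a minimal sketch (namespaces omitted; `timestamps` is
+/// an assumed timestamp Datum, formatted with the default format string):
+/// \code
+/// ARROW_ASSIGN_OR_RAISE(Datum formatted,
+///                       Strftime(timestamps, StrftimeOptions("%Y-%m-%dT%H:%M:%S")));
+/// \endcode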
+/// +/// \param[in] values input timestamps +/// \param[in] options for setting format string and locale +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 6.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Strftime(const Datum& values, StrftimeOptions options, + ExecContext* ctx = NULLPTR); + +/// \brief Parse timestamps according to a format string +/// +/// Return parsed timestamps according to the format string +/// `StrptimeOptions::format` at time resolution `Strftime::unit`. Parse errors are +/// raised depending on the `Strftime::error_is_null` setting. +/// +/// \param[in] values input strings +/// \param[in] options for setting format string, unit and error_is_null +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Strptime(const Datum& values, StrptimeOptions options, + ExecContext* ctx = NULLPTR); + +/// \brief Converts timestamps from local timestamp without a timezone to a timestamp with +/// timezone, interpreting the local timestamp as being in the specified timezone for each +/// element of `values` +/// +/// \param[in] values input to convert +/// \param[in] options for setting source timezone, exception and ambiguous timestamp +/// handling. +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 6.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result AssumeTimezone(const Datum& values, + AssumeTimezoneOptions options, + ExecContext* ctx = NULLPTR); + +/// \brief IsDaylightSavings extracts if currently observing daylight savings for each +/// element of `values` +/// +/// \param[in] values input to extract daylight savings indicator from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result IsDaylightSavings(const Datum& values, + ExecContext* ctx = NULLPTR); + +/// \brief LocalTimestamp converts timestamp to timezone naive local timestamp +/// +/// \param[in] values input to convert to local time +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 12.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result LocalTimestamp(const Datum& values, + ExecContext* ctx = NULLPTR); + +/// \brief Years Between finds the number of years between two values +/// +/// \param[in] left input treated as the start time +/// \param[in] right input treated as the end time +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result YearsBetween(const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief Quarters Between finds the number of quarters between two values +/// +/// \param[in] left input treated as the start time +/// \param[in] right input treated as the end time +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result QuartersBetween(const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief Months Between finds the number of month between two values +/// +/// \param[in] left input treated as the start time +/// \param[in] right input treated as the end time +/// \param[in] ctx the function 
execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MonthsBetween(const Datum& left, const Datum& right,
+                                         ExecContext* ctx = NULLPTR);
+
+/// \brief Weeks Between finds the number of weeks between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> WeeksBetween(const Datum& left, const Datum& right,
+                                        ExecContext* ctx = NULLPTR);
+
+/// \brief Month Day Nano Between finds the number of months, days, and nanoseconds
+/// between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MonthDayNanoBetween(const Datum& left, const Datum& right,
+                                               ExecContext* ctx = NULLPTR);
+
+/// \brief DayTime Between finds the number of days and milliseconds between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayTimeBetween(const Datum& left, const Datum& right,
+                                          ExecContext* ctx = NULLPTR);
+
+/// \brief Days Between finds the number of days between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DaysBetween(const Datum& left, const Datum& right,
+                                       ExecContext* ctx = NULLPTR);
+
+/// \brief Hours Between finds the number of hours between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> HoursBetween(const Datum& left, const Datum& right,
+                                        ExecContext* ctx = NULLPTR);
+
+/// \brief Minutes Between finds the number of minutes between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MinutesBetween(const Datum& left, const Datum& right,
+                                          ExecContext* ctx = NULLPTR);
+
+/// \brief Seconds Between finds the number of seconds between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> SecondsBetween(const Datum& left, const Datum& right,
+                                          ExecContext* ctx = NULLPTR);
+
+/// \brief Milliseconds Between finds the number of milliseconds between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+///
\param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result MillisecondsBetween(const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief Microseconds Between finds the number of microseconds between two values +/// +/// \param[in] left input treated as the start time +/// \param[in] right input treated as the end time +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result MicrosecondsBetween(const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief Nanoseconds Between finds the number of nanoseconds between two values +/// +/// \param[in] left input treated as the start time +/// \param[in] right input treated as the end time +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result NanosecondsBetween(const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief Finds either the FIRST, LAST, or ALL items with a key that matches the given +/// query key in a map. +/// +/// Returns an array of items for FIRST and LAST, and an array of list of items for ALL. +/// +/// \param[in] map to look in +/// \param[in] options to pass a query key and choose which matching keys to return +/// (FIRST, LAST or ALL) +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 8.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result MapLookup(const Datum& map, MapLookupOptions options, + ExecContext* ctx = NULLPTR); +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_vector.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..e5bcc3732966185e00612619d64a86867e1f4fca --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api_vector.h @@ -0,0 +1,709 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/compute/function_options.h" +#include "arrow/compute/ordering.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" + +namespace arrow { +namespace compute { + +class ExecContext; + +/// \addtogroup compute-concrete-options +/// @{ + +class ARROW_EXPORT FilterOptions : public FunctionOptions { + public: + /// Configure the action taken when a slot of the selection mask is null + enum NullSelectionBehavior { + /// The corresponding filtered value will be removed in the output. + DROP, + /// The corresponding filtered value will be null in the output. + EMIT_NULL, + }; + + explicit FilterOptions(NullSelectionBehavior null_selection = DROP); + static constexpr char const kTypeName[] = "FilterOptions"; + static FilterOptions Defaults() { return FilterOptions(); } + + NullSelectionBehavior null_selection_behavior = DROP; +}; + +class ARROW_EXPORT TakeOptions : public FunctionOptions { + public: + explicit TakeOptions(bool boundscheck = true); + static constexpr char const kTypeName[] = "TakeOptions"; + static TakeOptions BoundsCheck() { return TakeOptions(true); } + static TakeOptions NoBoundsCheck() { return TakeOptions(false); } + static TakeOptions Defaults() { return BoundsCheck(); } + + bool boundscheck = true; +}; + +/// \brief Options for the dictionary encode function +class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions { + public: + /// Configure how null values will be encoded + enum NullEncodingBehavior { + /// The null value will be added to the dictionary with a proper index. + ENCODE, + /// The null value will be masked in the indices array. + MASK + }; + + explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK); + static constexpr char const kTypeName[] = "DictionaryEncodeOptions"; + static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } + + NullEncodingBehavior null_encoding_behavior = MASK; +}; + +/// \brief Options for the run-end encode function +class ARROW_EXPORT RunEndEncodeOptions : public FunctionOptions { + public: + explicit RunEndEncodeOptions(std::shared_ptr run_end_type = int32()); + static constexpr char const kTypeName[] = "RunEndEncodeOptions"; + static RunEndEncodeOptions Defaults() { return RunEndEncodeOptions(); } + + std::shared_ptr run_end_type; +}; + +class ARROW_EXPORT ArraySortOptions : public FunctionOptions { + public: + explicit ArraySortOptions(SortOrder order = SortOrder::Ascending, + NullPlacement null_placement = NullPlacement::AtEnd); + static constexpr char const kTypeName[] = "ArraySortOptions"; + static ArraySortOptions Defaults() { return ArraySortOptions(); } + + /// Sorting order + SortOrder order; + /// Whether nulls and NaNs are placed at the start or at the end + NullPlacement null_placement; +}; + +class ARROW_EXPORT SortOptions : public FunctionOptions { + public: + explicit SortOptions(std::vector sort_keys = {}, + NullPlacement null_placement = NullPlacement::AtEnd); + explicit SortOptions(const Ordering& ordering); + static constexpr char const kTypeName[] = "SortOptions"; + static SortOptions Defaults() { return SortOptions(); } + /// Convenience constructor to create an ordering from SortOptions + /// + /// Note: Both classes contain the exact same information. However, + /// sort_options should only be used in a "function options" context while Ordering + /// is used more generally. 
+ Ordering AsOrdering() && { return Ordering(std::move(sort_keys), null_placement); } + Ordering AsOrdering() const& { return Ordering(sort_keys, null_placement); } + + /// Column key(s) to order by and how to order by these sort keys. + std::vector sort_keys; + /// Whether nulls and NaNs are placed at the start or at the end + NullPlacement null_placement; +}; + +/// \brief SelectK options +class ARROW_EXPORT SelectKOptions : public FunctionOptions { + public: + explicit SelectKOptions(int64_t k = -1, std::vector sort_keys = {}); + static constexpr char const kTypeName[] = "SelectKOptions"; + static SelectKOptions Defaults() { return SelectKOptions(); } + + static SelectKOptions TopKDefault(int64_t k, std::vector key_names = {}) { + std::vector keys; + for (const auto& name : key_names) { + keys.emplace_back(SortKey(name, SortOrder::Descending)); + } + if (key_names.empty()) { + keys.emplace_back(SortKey("not-used", SortOrder::Descending)); + } + return SelectKOptions{k, keys}; + } + static SelectKOptions BottomKDefault(int64_t k, + std::vector key_names = {}) { + std::vector keys; + for (const auto& name : key_names) { + keys.emplace_back(SortKey(name, SortOrder::Ascending)); + } + if (key_names.empty()) { + keys.emplace_back(SortKey("not-used", SortOrder::Ascending)); + } + return SelectKOptions{k, keys}; + } + + /// The number of `k` elements to keep. + int64_t k; + /// Column key(s) to order by and how to order by these sort keys. + std::vector sort_keys; +}; + +/// \brief Rank options +class ARROW_EXPORT RankOptions : public FunctionOptions { + public: + /// Configure how ties between equal values are handled + enum Tiebreaker { + /// Ties get the smallest possible rank in sorted order. + Min, + /// Ties get the largest possible rank in sorted order. + Max, + /// Ranks are assigned in order of when ties appear in the input. + /// This ensures the ranks are a stable permutation of the input. + First, + /// The ranks span a dense [1, M] interval where M is the number + /// of distinct values in the input. + Dense + }; + + explicit RankOptions(std::vector sort_keys = {}, + NullPlacement null_placement = NullPlacement::AtEnd, + Tiebreaker tiebreaker = RankOptions::First); + /// Convenience constructor for array inputs + explicit RankOptions(SortOrder order, + NullPlacement null_placement = NullPlacement::AtEnd, + Tiebreaker tiebreaker = RankOptions::First) + : RankOptions({SortKey("", order)}, null_placement, tiebreaker) {} + + static constexpr char const kTypeName[] = "RankOptions"; + static RankOptions Defaults() { return RankOptions(); } + + /// Column key(s) to order by and how to order by these sort keys. + std::vector sort_keys; + /// Whether nulls and NaNs are placed at the start or at the end + NullPlacement null_placement; + /// Tiebreaker for dealing with equal values in ranks + Tiebreaker tiebreaker; +}; + +/// \brief Partitioning options for NthToIndices +class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { + public: + explicit PartitionNthOptions(int64_t pivot, + NullPlacement null_placement = NullPlacement::AtEnd); + PartitionNthOptions() : PartitionNthOptions(0) {} + static constexpr char const kTypeName[] = "PartitionNthOptions"; + + /// The index into the equivalent sorted array of the partition pivot element. 
+ int64_t pivot; + /// Whether nulls and NaNs are partitioned at the start or at the end + NullPlacement null_placement; +}; + +/// \brief Options for cumulative functions +/// \note Also aliased as CumulativeSumOptions for backward compatibility +class ARROW_EXPORT CumulativeOptions : public FunctionOptions { + public: + explicit CumulativeOptions(bool skip_nulls = false); + explicit CumulativeOptions(double start, bool skip_nulls = false); + explicit CumulativeOptions(std::shared_ptr start, bool skip_nulls = false); + static constexpr char const kTypeName[] = "CumulativeOptions"; + static CumulativeOptions Defaults() { return CumulativeOptions(); } + + /// Optional starting value for cumulative operation computation, default depends on the + /// operation and input type. + /// - sum: 0 + /// - prod: 1 + /// - min: maximum of the input type + /// - max: minimum of the input type + /// - mean: start is ignored because it has no meaning for mean + std::optional> start; + + /// If true, nulls in the input are ignored and produce a corresponding null output. + /// When false, the first null encountered is propagated through the remaining output. + bool skip_nulls = false; +}; +using CumulativeSumOptions = CumulativeOptions; // For backward compatibility + +/// \brief Options for pairwise functions +class ARROW_EXPORT PairwiseOptions : public FunctionOptions { + public: + explicit PairwiseOptions(int64_t periods = 1); + static constexpr char const kTypeName[] = "PairwiseOptions"; + static PairwiseOptions Defaults() { return PairwiseOptions(); } + + /// Periods to shift for applying the binary operation, accepts negative values. + int64_t periods = 1; +}; + +/// \brief Options for list_flatten function +class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { + public: + explicit ListFlattenOptions(bool recursive = false); + static constexpr char const kTypeName[] = "ListFlattenOptions"; + static ListFlattenOptions Defaults() { return ListFlattenOptions(); } + + /// \brief If true, the list is flattened recursively until a non-list + /// array is formed. + bool recursive = false; +}; + +/// @} + +/// \brief Filter with a boolean selection filter +/// +/// The output will be populated with values from the input at positions +/// where the selection filter is not 0. Nulls in the filter will be handled +/// based on options.null_selection_behavior. 
+/// +/// For example given values = ["a", "b", "c", null, "e", "f"] and +/// filter = [0, 1, 1, 0, null, 1], the output will be +/// (null_selection_behavior == DROP) = ["b", "c", "f"] +/// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"] +/// +/// \param[in] values array to filter +/// \param[in] filter indicates which values should be filtered out +/// \param[in] options configures null_selection_behavior +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +ARROW_EXPORT +Result Filter(const Datum& values, const Datum& filter, + const FilterOptions& options = FilterOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +namespace internal { + +// These internal functions are implemented in kernels/vector_selection.cc + +/// \brief Return the number of selected indices in the boolean filter +/// +/// \param filter a plain or run-end encoded boolean array with or without nulls +/// \param null_selection how to handle nulls in the filter +ARROW_EXPORT +int64_t GetFilterOutputSize(const ArraySpan& filter, + FilterOptions::NullSelectionBehavior null_selection); + +/// \brief Compute uint64 selection indices for use with Take given a boolean +/// filter +/// +/// \param filter a plain or run-end encoded boolean array with or without nulls +/// \param null_selection how to handle nulls in the filter +ARROW_EXPORT +Result> GetTakeIndices( + const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, + MemoryPool* memory_pool = default_memory_pool()); + +} // namespace internal + +/// \brief ReplaceWithMask replaces each value in the array corresponding +/// to a true value in the mask with the next element from `replacements`. +/// +/// \param[in] values Array input to replace +/// \param[in] mask Array or Scalar of Boolean mask values +/// \param[in] replacements The replacement values to draw from. There must +/// be as many replacement values as true values in the mask. +/// \param[in] ctx the function execution context, optional +/// +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result ReplaceWithMask(const Datum& values, const Datum& mask, + const Datum& replacements, ExecContext* ctx = NULLPTR); + +/// \brief FillNullForward fill null values in forward direction +/// +/// The output array will be of the same type as the input values +/// array, with replaced null values in forward direction. +/// +/// For example given values = ["a", "b", "c", null, null, "f"], +/// the output will be = ["a", "b", "c", "c", "c", "f"] +/// +/// \param[in] values datum from which to take +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +ARROW_EXPORT +Result FillNullForward(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief FillNullBackward fill null values in backward direction +/// +/// The output array will be of the same type as the input values +/// array, with replaced null values in backward direction. 
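// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): Filter() with
// the two NullSelectionBehavior modes documented above. The wrapper name and
// the caller-provided `values`/`mask` arrays are assumptions.
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Result<arrow::Datum> FilterKeepingNullSlots(
    const std::shared_ptr<arrow::Array>& values,
    const std::shared_ptr<arrow::Array>& mask) {
  // EMIT_NULL keeps a null in the output wherever the mask itself is null;
  // the default DROP behavior would remove those slots entirely.
  arrow::compute::FilterOptions options(arrow::compute::FilterOptions::EMIT_NULL);
  return arrow::compute::Filter(arrow::Datum(values), arrow::Datum(mask), options);
}
// ---------------------------------------------------------------------------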
+/// +/// For example given values = ["a", "b", "c", null, null, "f"], +/// the output will be = ["a", "b", "c", "f", "f", "f"] +/// +/// \param[in] values datum from which to take +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +ARROW_EXPORT +Result FillNullBackward(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Take from an array of values at indices in another array +/// +/// The output array will be of the same type as the input values +/// array, with elements taken from the values array at the given +/// indices. If an index is null then the taken element will be null. +/// +/// For example given values = ["a", "b", "c", null, "e", "f"] and +/// indices = [2, 1, null, 3], the output will be +/// = [values[2], values[1], null, values[3]] +/// = ["c", "b", null, null] +/// +/// \param[in] values datum from which to take +/// \param[in] indices which values to take +/// \param[in] options options +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +ARROW_EXPORT +Result Take(const Datum& values, const Datum& indices, + const TakeOptions& options = TakeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Take with Array inputs and output +ARROW_EXPORT +Result> Take(const Array& values, const Array& indices, + const TakeOptions& options = TakeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Drop Null from an array of values +/// +/// The output array will be of the same type as the input values +/// array, with elements taken from the values array without nulls. +/// +/// For example given values = ["a", "b", "c", null, "e", "f"], +/// the output will be = ["a", "b", "c", "e", "f"] +/// +/// \param[in] values datum from which to take +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +ARROW_EXPORT +Result DropNull(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DropNull with Array inputs and output +ARROW_EXPORT +Result> DropNull(const Array& values, ExecContext* ctx = NULLPTR); + +/// \brief Return indices that partition an array around n-th sorted element. +/// +/// Find index of n-th(0 based) smallest value and perform indirect +/// partition of an array around that element. Output indices[0 ~ n-1] +/// holds values no greater than n-th element, and indices[n+1 ~ end] +/// holds values no less than n-th element. Elements in each partition +/// is not sorted. Nulls will be partitioned to the end of the output. +/// Output is not guaranteed to be stable. +/// +/// \param[in] values array to be partitioned +/// \param[in] n pivot array around sorted n-th element +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would partition an array +ARROW_EXPORT +Result> NthToIndices(const Array& values, int64_t n, + ExecContext* ctx = NULLPTR); + +/// \brief Return indices that partition an array around n-th sorted element. +/// +/// This overload takes a PartitionNthOptions specifying the pivot index +/// and the null handling. +/// +/// \param[in] values array to be partitioned +/// \param[in] options options including pivot index and null handling +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would partition an array +ARROW_EXPORT +Result> NthToIndices(const Array& values, + const PartitionNthOptions& options, + ExecContext* ctx = NULLPTR); + +/// \brief Return indices that would select the first `k` elements. 
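// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): Take() with an
// explicit index array, matching the worked example above (values
// ["a", "b", "c", null, "e", "f"], indices [2, 1, null, 3]). The wrapper name
// is an assumption.
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Result<std::shared_ptr<arrow::Array>> GatherByIndex(
    const std::shared_ptr<arrow::Array>& values,
    const std::shared_ptr<arrow::Array>& indices) {
  // Bounds checking is on by default; TakeOptions::NoBoundsCheck() skips it
  // when the indices are already known to be in range.
  return arrow::compute::Take(*values, *indices,
                              arrow::compute::TakeOptions::BoundsCheck());
}
// ---------------------------------------------------------------------------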
+/// +/// Perform an indirect sort of the datum, keeping only the first `k` elements. The output +/// array will contain indices such that the item indicated by the k-th index will be in +/// the position it would be if the datum were sorted by `options.sort_keys`. However, +/// indices of null values will not be part of the output. The sort is not guaranteed to +/// be stable. +/// +/// \param[in] datum datum to be partitioned +/// \param[in] options options +/// \param[in] ctx the function execution context, optional +/// \return a datum with the same schema as the input +ARROW_EXPORT +Result> SelectKUnstable(const Datum& datum, + const SelectKOptions& options, + ExecContext* ctx = NULLPTR); + +/// \brief Return the indices that would sort an array. +/// +/// Perform an indirect sort of array. The output array will contain +/// indices that would sort an array, which would be the same length +/// as input. Nulls will be stably partitioned to the end of the output +/// regardless of order. +/// +/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order +/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0, +/// 3]. +/// +/// \param[in] array array to sort +/// \param[in] order ascending or descending +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would sort an array +ARROW_EXPORT +Result> SortIndices(const Array& array, + SortOrder order = SortOrder::Ascending, + ExecContext* ctx = NULLPTR); + +/// \brief Return the indices that would sort an array. +/// +/// This overload takes a ArraySortOptions specifying the sort order +/// and the null handling. +/// +/// \param[in] array array to sort +/// \param[in] options options including sort order and null handling +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would sort an array +ARROW_EXPORT +Result> SortIndices(const Array& array, + const ArraySortOptions& options, + ExecContext* ctx = NULLPTR); + +/// \brief Return the indices that would sort a chunked array. +/// +/// Perform an indirect sort of chunked array. The output array will +/// contain indices that would sort a chunked array, which would be +/// the same length as input. Nulls will be stably partitioned to the +/// end of the output regardless of order. +/// +/// For example given chunked_array = [[null, 1], [3.3], [null, 2, +/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2, +/// 4, 1, 0, 3]. +/// +/// \param[in] chunked_array chunked array to sort +/// \param[in] order ascending or descending +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would sort an array +ARROW_EXPORT +Result> SortIndices(const ChunkedArray& chunked_array, + SortOrder order = SortOrder::Ascending, + ExecContext* ctx = NULLPTR); + +/// \brief Return the indices that would sort a chunked array. +/// +/// This overload takes a ArraySortOptions specifying the sort order +/// and the null handling. +/// +/// \param[in] chunked_array chunked array to sort +/// \param[in] options options including sort order and null handling +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would sort an array +ARROW_EXPORT +Result> SortIndices(const ChunkedArray& chunked_array, + const ArraySortOptions& options, + ExecContext* ctx = NULLPTR); + +/// \brief Return the indices that would sort an input in the +/// specified order. Input is one of array, chunked array record batch +/// or table. 
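// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): indirect sort
// of a single array through the ArraySortOptions overload declared above, with
// nulls placed at the start instead of the default end placement. The wrapper
// name is an assumption.
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Result<std::shared_ptr<arrow::Array>> DescendingOrderIndices(
    const std::shared_ptr<arrow::Array>& values) {
  arrow::compute::ArraySortOptions options(arrow::compute::SortOrder::Descending,
                                           arrow::compute::NullPlacement::AtStart);
  // The result holds indices into `values`; apply Take() to materialize the
  // sorted array itself.
  return arrow::compute::SortIndices(*values, options);
}
// ---------------------------------------------------------------------------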
+/// +/// Perform an indirect sort of input. The output array will contain +/// indices that would sort an input, which would be the same length +/// as input. Nulls will be stably partitioned to the start or to the end +/// of the output depending on SortOrder::null_placement. +/// +/// For example given input (table) = { +/// "column1": [[null, 1], [ 3, null, 2, 1]], +/// "column2": [[ 5], [3, null, null, 5, 5]], +/// } and options = { +/// {"column1", SortOrder::Ascending}, +/// {"column2", SortOrder::Descending}, +/// }, the output will be [5, 1, 4, 2, 0, 3]. +/// +/// \param[in] datum array, chunked array, record batch or table to sort +/// \param[in] options options +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would sort a table +ARROW_EXPORT +Result> SortIndices(const Datum& datum, const SortOptions& options, + ExecContext* ctx = NULLPTR); + +/// \brief Compute unique elements from an array-like object +/// +/// Note if a null occurs in the input it will NOT be included in the output. +/// +/// \param[in] datum array-like input +/// \param[in] ctx the function execution context, optional +/// \return result as Array +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result> Unique(const Datum& datum, ExecContext* ctx = NULLPTR); + +// Constants for accessing the output of ValueCounts +ARROW_EXPORT extern const char kValuesFieldName[]; +ARROW_EXPORT extern const char kCountsFieldName[]; +ARROW_EXPORT extern const int32_t kValuesFieldIndex; +ARROW_EXPORT extern const int32_t kCountsFieldIndex; + +/// \brief Return counts of unique elements from an array-like object. +/// +/// Note that the counts do not include counts for nulls in the array. These can be +/// obtained separately from metadata. +/// +/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values +/// which can lead to unexpected results if the input Array has these values. +/// +/// \param[in] value array-like input +/// \param[in] ctx the function execution context, optional +/// \return counts An array of structs. +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result> ValueCounts(const Datum& value, + ExecContext* ctx = NULLPTR); + +/// \brief Dictionary-encode values in an array-like object +/// +/// Any nulls encountered in the dictionary will be handled according to the +/// specified null encoding behavior. +/// +/// For example, given values ["a", "b", null, "a", null] the output will be +/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null] +/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"] +/// +/// If the input is already dictionary encoded this function is a no-op unless +/// it needs to modify the null_encoding (TODO) +/// +/// \param[in] data array-like input +/// \param[in] ctx the function execution context, optional +/// \param[in] options configures null encoding behavior +/// \return result with same shape and type as input +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result DictionaryEncode( + const Datum& data, + const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Run-end-encode values in an array-like object +/// +/// The returned run-end encoded type uses the same value type of the input and +/// run-end type defined in the options. 
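// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): distinct-value
// counting and dictionary encoding with the helpers declared above. The
// wrapper names are assumptions.
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Result<std::shared_ptr<arrow::StructArray>> DistinctValueCounts(
    const std::shared_ptr<arrow::Array>& values) {
  // Returns a struct array with "values" and "counts" fields; nulls in the
  // input are not counted, as noted above.
  return arrow::compute::ValueCounts(arrow::Datum(values));
}

arrow::Result<arrow::Datum> EncodeWithNullEntry(
    const std::shared_ptr<arrow::Array>& values) {
  // ENCODE gives nulls their own dictionary entry; the default MASK leaves
  // them as nulls in the index array instead.
  arrow::compute::DictionaryEncodeOptions options(
      arrow::compute::DictionaryEncodeOptions::ENCODE);
  return arrow::compute::DictionaryEncode(arrow::Datum(values), options);
}
// ---------------------------------------------------------------------------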
+/// +/// \param[in] value array-like input +/// \param[in] options configures encoding behavior +/// \param[in] ctx the function execution context, optional +/// \return result with same shape but run-end encoded +/// +/// \since 12.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result RunEndEncode( + const Datum& value, + const RunEndEncodeOptions& options = RunEndEncodeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Decode a Run-End Encoded array to a plain array +/// +/// The output data type is the same as the values array type of run-end encoded +/// input. +/// +/// \param[in] value run-end-encoded input +/// \param[in] ctx the function execution context, optional +/// \return plain array resulting from decoding the run-end encoded input +/// +/// \since 12.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result RunEndDecode(const Datum& value, ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative sum of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative sum behavior +/// \param[in] check_overflow whether to check for overflow, if true, return Invalid +/// status on overflow, otherwise wrap around on overflow +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeSum( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + bool check_overflow = false, ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative product of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative prod behavior +/// \param[in] check_overflow whether to check for overflow, if true, return Invalid +/// status on overflow, otherwise wrap around on overflow +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeProd( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + bool check_overflow = false, ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative max of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative max behavior +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeMax( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative min of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative min behavior +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeMin( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative mean of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative mean behavior, `start` is ignored +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeMean( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Return the first order difference of an array. +/// +/// Computes the first order difference of an array, i.e. +/// output[i] = input[i] - input[i - p] if i >= p +/// output[i] = null otherwise +/// where p is the period. For example, with p = 1, +/// Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5]. 
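// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): a checked
// running total using CumulativeSum() and CumulativeOptions as declared above.
// The wrapper name and the starting value of 100 are assumptions.
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Result<arrow::Datum> RunningTotal(
    const std::shared_ptr<arrow::Array>& values) {
  // Accumulate starting from 100 instead of the default 0; with
  // skip_nulls=false the first null propagates through the rest of the output.
  arrow::compute::CumulativeOptions options(/*start=*/100.0, /*skip_nulls=*/false);
  return arrow::compute::CumulativeSum(arrow::Datum(values), options,
                                       /*check_overflow=*/true);
}
// ---------------------------------------------------------------------------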
+/// With p = 2, +/// Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6] +/// p can also be negative, in which case the diff is computed in +/// the opposite direction. +/// \param[in] array array input +/// \param[in] options options, specifying overflow behavior and period +/// \param[in] check_overflow whether to return error on overflow +/// \param[in] ctx the function execution context, optional +/// \return result as array +ARROW_EXPORT +Result> PairwiseDiff(const Array& array, + const PairwiseOptions& options, + bool check_overflow = false, + ExecContext* ctx = NULLPTR); + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/cast.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/cast.h new file mode 100644 index 0000000000000000000000000000000000000000..18e56092dda2a5f8f997de5b5cd1c81262e77a8f --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/cast.h @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; + +namespace compute { + +class ExecContext; + +/// \addtogroup compute-concrete-options +/// @{ + +class ARROW_EXPORT CastOptions : public FunctionOptions { + public: + explicit CastOptions(bool safe = true); + + static constexpr char const kTypeName[] = "CastOptions"; + static CastOptions Safe(TypeHolder to_type = {}) { + CastOptions safe(true); + safe.to_type = std::move(to_type); + return safe; + } + + static CastOptions Unsafe(TypeHolder to_type = {}) { + CastOptions unsafe(false); + unsafe.to_type = std::move(to_type); + return unsafe; + } + + // Type being casted to. May be passed separate to eager function + // compute::Cast + TypeHolder to_type; + + bool allow_int_overflow; + bool allow_time_truncate; + bool allow_time_overflow; + bool allow_decimal_truncate; + bool allow_float_truncate; + // Indicate if conversions from Binary/FixedSizeBinary to string must + // validate the utf8 payload. 
+ bool allow_invalid_utf8; + + /// true if the safety options all match CastOptions::Safe + /// + /// Note, if this returns false it does not mean is_unsafe will return true + bool is_safe() const; + /// true if the safety options all match CastOptions::Unsafe + /// + /// Note, if this returns false it does not mean is_safe will return true + bool is_unsafe() const; +}; + +/// @} + +/// \brief Return true if a cast function is defined +ARROW_EXPORT +bool CanCast(const DataType& from_type, const DataType& to_type); + +// ---------------------------------------------------------------------- +// Convenience invocation APIs for a number of kernels + +/// \brief Cast from one array type to another +/// \param[in] value array to cast +/// \param[in] to_type type to cast to +/// \param[in] options casting options +/// \param[in] ctx the function execution context, optional +/// \return the resulting array +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result> Cast(const Array& value, const TypeHolder& to_type, + const CastOptions& options = CastOptions::Safe(), + ExecContext* ctx = NULLPTR); + +/// \brief Cast from one array type to another +/// \param[in] value array to cast +/// \param[in] options casting options. The "to_type" field must be populated +/// \param[in] ctx the function execution context, optional +/// \return the resulting array +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Cast(const Datum& value, const CastOptions& options, + ExecContext* ctx = NULLPTR); + +/// \brief Cast from one value to another +/// \param[in] value datum to cast +/// \param[in] to_type type to cast to +/// \param[in] options casting options +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Cast(const Datum& value, const TypeHolder& to_type, + const CastOptions& options = CastOptions::Safe(), + ExecContext* ctx = NULLPTR); + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/exec.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/exec.h new file mode 100644 index 0000000000000000000000000000000000000000..3fbefe4a1ab7b7e432e07607f674b5de1c947cd5 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/exec.h @@ -0,0 +1,489 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
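// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): the Cast()
// wrappers declared in cast.h above, once with the safe defaults and once with
// float truncation explicitly allowed. The wrapper names are assumptions.
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Result<std::shared_ptr<arrow::Array>> ToInt32Checked(
    const std::shared_ptr<arrow::Array>& values) {
  // CastOptions::Safe() rejects lossy conversions (overflow, truncation, ...).
  return arrow::compute::Cast(*values, arrow::int32(),
                              arrow::compute::CastOptions::Safe());
}

arrow::Result<std::shared_ptr<arrow::Array>> ToInt32Truncating(
    const std::shared_ptr<arrow::Array>& values) {
  arrow::compute::CastOptions options = arrow::compute::CastOptions::Safe();
  options.allow_float_truncate = true;  // permit dropping fractional parts only
  return arrow::compute::Cast(*values, arrow::int32(), options);
}
// ---------------------------------------------------------------------------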
+ +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/compute/expression.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/datum.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +// It seems like 64K might be a good default chunksize to use for execution +// based on the experience of other query processing systems. The current +// default is not to chunk contiguous arrays, though, but this may change in +// the future once parallel execution is implemented +static constexpr int64_t kDefaultExecChunksize = UINT16_MAX; + +/// \brief Context for expression-global variables and options used by +/// function evaluation +class ARROW_EXPORT ExecContext { + public: + // If no function registry passed, the default is used. + explicit ExecContext(MemoryPool* pool = default_memory_pool(), + ::arrow::internal::Executor* executor = NULLPTR, + FunctionRegistry* func_registry = NULLPTR); + + /// \brief The MemoryPool used for allocations, default is + /// default_memory_pool(). + MemoryPool* memory_pool() const { return pool_; } + + const ::arrow::internal::CpuInfo* cpu_info() const; + + /// \brief An Executor which may be used to parallelize execution. + ::arrow::internal::Executor* executor() const { return executor_; } + + /// \brief The FunctionRegistry for looking up functions by name and + /// selecting kernels for execution. Defaults to the library-global function + /// registry provided by GetFunctionRegistry. + FunctionRegistry* func_registry() const { return func_registry_; } + + // \brief Set maximum length unit of work for kernel execution. Larger + // contiguous array inputs will be split into smaller chunks, and, if + // possible and enabled, processed in parallel. The default chunksize is + // INT64_MAX, so contiguous arrays are not split. + void set_exec_chunksize(int64_t chunksize) { exec_chunksize_ = chunksize; } + + // \brief Maximum length for ExecBatch data chunks processed by + // kernels. Contiguous array inputs with longer length will be split into + // smaller chunks. + int64_t exec_chunksize() const { return exec_chunksize_; } + + /// \brief Set whether to use multiple threads for function execution. This + /// is not yet used. + void set_use_threads(bool use_threads = true) { use_threads_ = use_threads; } + + /// \brief If true, then utilize multiple threads where relevant for function + /// execution. This is not yet used. + bool use_threads() const { return use_threads_; } + + // Set the preallocation strategy for kernel execution as it relates to + // chunked execution. For chunked execution, whether via ChunkedArray inputs + // or splitting larger Array arguments into smaller pieces, contiguous + // allocation (if permitted by the kernel) will allocate one large array to + // write output into yielding it to the caller at the end. If this option is + // set to off, then preallocations will be performed independently for each + // chunk of execution + // + // TODO: At some point we might want the limit the size of contiguous + // preallocations. For example, even if the exec_chunksize is 64K or less, we + // might limit contiguous allocations to 1M records, say. 
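// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): threading an
// explicitly configured ExecContext through a compute call. Every convenience
// function in these headers accepts such a context as its trailing optional
// argument. The wrapper name is an assumption.
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/api.h"
#include "arrow/compute/exec.h"

arrow::Result<std::shared_ptr<arrow::Array>> SortWithContext(
    const std::shared_ptr<arrow::Array>& values, arrow::MemoryPool* pool) {
  arrow::compute::ExecContext ctx(pool);
  // Ask kernels to process long contiguous inputs in ~64K-row chunks.
  ctx.set_exec_chunksize(arrow::compute::kDefaultExecChunksize);
  return arrow::compute::SortIndices(*values, arrow::compute::SortOrder::Ascending,
                                     &ctx);
}
// ---------------------------------------------------------------------------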
+ void set_preallocate_contiguous(bool preallocate) { + preallocate_contiguous_ = preallocate; + } + + /// \brief If contiguous preallocations should be used when doing chunked + /// execution as specified by exec_chunksize(). See + /// set_preallocate_contiguous() for more information. + bool preallocate_contiguous() const { return preallocate_contiguous_; } + + private: + MemoryPool* pool_; + ::arrow::internal::Executor* executor_; + FunctionRegistry* func_registry_; + int64_t exec_chunksize_ = std::numeric_limits::max(); + bool preallocate_contiguous_ = true; + bool use_threads_ = true; +}; + +// TODO: Consider standardizing on uint16 selection vectors and only use them +// when we can ensure that each value is 64K length or smaller + +/// \brief Container for an array of value selection indices that were +/// materialized from a filter. +/// +/// Columnar query engines (see e.g. [1]) have found that rather than +/// materializing filtered data, the filter can instead be converted to an +/// array of the "on" indices and then "fusing" these indices in operator +/// implementations. This is especially relevant for aggregations but also +/// applies to scalar operations. +/// +/// We are not yet using this so this is mostly a placeholder for now. +/// +/// [1]: http://cidrdb.org/cidr2005/papers/P19.pdf +class ARROW_EXPORT SelectionVector { + public: + explicit SelectionVector(std::shared_ptr data); + + explicit SelectionVector(const Array& arr); + + /// \brief Create SelectionVector from boolean mask + static Result> FromMask(const BooleanArray& arr); + + const int32_t* indices() const { return indices_; } + int32_t length() const; + + private: + std::shared_ptr data_; + const int32_t* indices_; +}; + +/// An index to represent that a batch does not belong to an ordered stream +constexpr int64_t kUnsequencedIndex = -1; + +/// \brief A unit of work for kernel execution. It contains a collection of +/// Array and Scalar values and an optional SelectionVector indicating that +/// there is an unmaterialized filter that either must be materialized, or (if +/// the kernel supports it) pushed down into the kernel implementation. +/// +/// ExecBatch is semantically similar to RecordBatch in that in a SQL context +/// it represents a collection of records, but constant "columns" are +/// represented by Scalar values rather than having to be converted into arrays +/// with repeated values. +/// +/// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight +/// than is desirable for this class. Microbenchmarks would help determine for +/// sure. See ARROW-8928. + +/// \addtogroup acero-internals +/// @{ + +struct ARROW_EXPORT ExecBatch { + ExecBatch() = default; + ExecBatch(std::vector values, int64_t length) + : values(std::move(values)), length(length) {} + + explicit ExecBatch(const RecordBatch& batch); + + /// \brief Infer the ExecBatch length from values. + static Result InferLength(const std::vector& values); + + /// Creates an ExecBatch with length-validation. + /// + /// If any value is given, then all values must have a common length. If the given + /// length is negative, then the length of the ExecBatch is set to this common length, + /// or to 1 if no values are given. Otherwise, the given length must equal the common + /// length, if any value is given. 
+ static Result Make(std::vector values, int64_t length = -1); + + Result> ToRecordBatch( + std::shared_ptr schema, MemoryPool* pool = default_memory_pool()) const; + + /// The values representing positional arguments to be passed to a kernel's + /// exec function for processing. + std::vector values; + + /// A deferred filter represented as an array of indices into the values. + /// + /// For example, the filter [true, true, false, true] would be represented as + /// the selection vector [0, 1, 3]. When the selection vector is set, + /// ExecBatch::length is equal to the length of this array. + std::shared_ptr selection_vector; + + /// A predicate Expression guaranteed to evaluate to true for all rows in this batch. + Expression guarantee = literal(true); + + /// The semantic length of the ExecBatch. When the values are all scalars, + /// the length should be set to 1 for non-aggregate kernels, otherwise the + /// length is taken from the array values, except when there is a selection + /// vector. When there is a selection vector set, the length of the batch is + /// the length of the selection. Aggregate kernels can have an ExecBatch + /// formed by projecting just the partition columns from a batch in which + /// case, it would have scalar rows with length greater than 1. + /// + /// If the array values are of length 0 then the length is 0 regardless of + /// whether any values are Scalar. + int64_t length = 0; + + /// \brief index of this batch in a sorted stream of batches + /// + /// This index must be strictly monotonic starting at 0 without gaps or + /// it can be set to kUnsequencedIndex if there is no meaningful order + int64_t index = kUnsequencedIndex; + + /// \brief The sum of bytes in each buffer referenced by the batch + /// + /// Note: Scalars are not counted + /// Note: Some values may referenced only part of a buffer, for + /// example, an array with an offset. The actual data + /// visible to this batch will be smaller than the total + /// buffer size in this case. + int64_t TotalBufferSize() const; + + /// \brief Return the value at the i-th index + template + inline const Datum& operator[](index_type i) const { + return values[i]; + } + + bool Equals(const ExecBatch& other) const; + + /// \brief A convenience for the number of values / arguments. + int num_values() const { return static_cast(values.size()); } + + ExecBatch Slice(int64_t offset, int64_t length) const; + + Result SelectValues(const std::vector& ids) const; + + /// \brief A convenience for returning the types from the batch. 
+ std::vector GetTypes() const { + std::vector result; + for (const auto& value : this->values) { + result.emplace_back(value.type()); + } + return result; + } + + std::string ToString() const; +}; + +inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); } +inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); } + +ARROW_EXPORT void PrintTo(const ExecBatch&, std::ostream*); + +/// @} + +/// \defgroup compute-internals Utilities for calling functions, useful for those +/// extending the function registry +/// +/// @{ + +struct ExecValue { + ArraySpan array = {}; + const Scalar* scalar = NULLPTR; + + ExecValue(Scalar* scalar) // NOLINT implicit conversion + : scalar(scalar) {} + + ExecValue(ArraySpan array) // NOLINT implicit conversion + : array(std::move(array)) {} + + ExecValue(const ArrayData& array) { // NOLINT implicit conversion + this->array.SetMembers(array); + } + + ExecValue() = default; + ExecValue(const ExecValue& other) = default; + ExecValue& operator=(const ExecValue& other) = default; + ExecValue(ExecValue&& other) = default; + ExecValue& operator=(ExecValue&& other) = default; + + int64_t length() const { return this->is_array() ? this->array.length : 1; } + + bool is_array() const { return this->scalar == NULLPTR; } + bool is_scalar() const { return !this->is_array(); } + + void SetArray(const ArrayData& array) { + this->array.SetMembers(array); + this->scalar = NULLPTR; + } + + void SetScalar(const Scalar* scalar) { this->scalar = scalar; } + + template + const ExactType& scalar_as() const { + return ::arrow::internal::checked_cast(*this->scalar); + } + + /// XXX: here temporarily for compatibility with datum, see + /// e.g. MakeStructExec in scalar_nested.cc + int64_t null_count() const { + if (this->is_array()) { + return this->array.GetNullCount(); + } else { + return this->scalar->is_valid ? 0 : 1; + } + } + + const DataType* type() const { + if (this->is_array()) { + return array.type; + } else { + return scalar->type.get(); + } + } +}; + +struct ARROW_EXPORT ExecResult { + // The default value of the variant is ArraySpan + std::variant> value; + + int64_t length() const { + if (this->is_array_span()) { + return this->array_span()->length; + } else { + return this->array_data()->length; + } + } + + const DataType* type() const { + if (this->is_array_span()) { + return this->array_span()->type; + } else { + return this->array_data()->type.get(); + } + } + + const ArraySpan* array_span() const { return &std::get(this->value); } + ArraySpan* array_span_mutable() { return &std::get(this->value); } + + bool is_array_span() const { return this->value.index() == 0; } + + const std::shared_ptr& array_data() const { + return std::get>(this->value); + } + ArrayData* array_data_mutable() { + return std::get>(this->value).get(); + } + + bool is_array_data() const { return this->value.index() == 1; } +}; + +/// \brief A "lightweight" column batch object which contains no +/// std::shared_ptr objects and does not have any memory ownership +/// semantics. Can represent a view onto an "owning" ExecBatch. 
+struct ARROW_EXPORT ExecSpan { + ExecSpan() = default; + ExecSpan(const ExecSpan& other) = default; + ExecSpan& operator=(const ExecSpan& other) = default; + ExecSpan(ExecSpan&& other) = default; + ExecSpan& operator=(ExecSpan&& other) = default; + + explicit ExecSpan(std::vector values, int64_t length) + : length(length), values(std::move(values)) {} + + explicit ExecSpan(const ExecBatch& batch) { + this->length = batch.length; + this->values.resize(batch.values.size()); + for (size_t i = 0; i < batch.values.size(); ++i) { + const Datum& in_value = batch[i]; + ExecValue* out_value = &this->values[i]; + if (in_value.is_array()) { + out_value->SetArray(*in_value.array()); + } else { + out_value->SetScalar(in_value.scalar().get()); + } + } + } + + /// \brief Return the value at the i-th index + template + inline const ExecValue& operator[](index_type i) const { + return values[i]; + } + + /// \brief A convenience for the number of values / arguments. + int num_values() const { return static_cast(values.size()); } + + std::vector GetTypes() const { + std::vector result; + for (const auto& value : this->values) { + result.emplace_back(value.type()); + } + return result; + } + + ExecBatch ToExecBatch() const { + ExecBatch result; + result.length = this->length; + for (const ExecValue& value : this->values) { + if (value.is_array()) { + result.values.push_back(value.array.ToArrayData()); + } else { + result.values.push_back(value.scalar->GetSharedPtr()); + } + } + return result; + } + + int64_t length = 0; + std::vector values; +}; + +/// \defgroup compute-call-function One-shot calls to compute functions +/// +/// @{ + +/// \brief One-shot invoker for all types of functions. +/// +/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs, +/// and wrapping of outputs. +ARROW_EXPORT +Result CallFunction(const std::string& func_name, const std::vector& args, + const FunctionOptions* options, ExecContext* ctx = NULLPTR); + +/// \brief Variant of CallFunction which uses a function's default options. +/// +/// NB: Some functions require FunctionOptions be provided. +ARROW_EXPORT +Result CallFunction(const std::string& func_name, const std::vector& args, + ExecContext* ctx = NULLPTR); + +/// \brief One-shot invoker for all types of functions. +/// +/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs, +/// and wrapping of outputs. +ARROW_EXPORT +Result CallFunction(const std::string& func_name, const ExecBatch& batch, + const FunctionOptions* options, ExecContext* ctx = NULLPTR); + +/// \brief Variant of CallFunction which uses a function's default options. +/// +/// NB: Some functions require FunctionOptions be provided. +ARROW_EXPORT +Result CallFunction(const std::string& func_name, const ExecBatch& batch, + ExecContext* ctx = NULLPTR); + +/// @} + +/// \defgroup compute-function-executor One-shot calls to obtain function executors +/// +/// @{ + +/// \brief One-shot executor provider for all types of functions. +/// +/// This function creates and initializes a `FunctionExecutor` appropriate +/// for the given function name, input types and function options. +ARROW_EXPORT +Result> GetFunctionExecutor( + const std::string& func_name, std::vector in_types, + const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR); + +/// \brief One-shot executor provider for all types of functions. 
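// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): the name-based
// CallFunction() entry point declared above, dispatching to the "add" function
// registered in the default registry. The wrapper name is an assumption.
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/exec.h"

arrow::Result<arrow::Datum> AddColumns(const std::shared_ptr<arrow::Array>& left,
                                       const std::shared_ptr<arrow::Array>& right) {
  // Handles kernel dispatch, argument checking, ChunkedArray iteration and
  // output wrapping, as described above; no FunctionOptions are needed here.
  return arrow::compute::CallFunction("add",
                                      {arrow::Datum(left), arrow::Datum(right)});
}
// ---------------------------------------------------------------------------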
+/// +/// This function creates and initializes a `FunctionExecutor` appropriate +/// for the given function name, input types (taken from the Datum arguments) +/// and function options. +ARROW_EXPORT +Result> GetFunctionExecutor( + const std::string& func_name, const std::vector& args, + const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR); + +/// @} + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/expression.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/expression.h new file mode 100644 index 0000000000000000000000000000000000000000..9a36a6d3368fb9ee0486c9dba9ab86ba10764dc7 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/expression.h @@ -0,0 +1,295 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/compute/type_fwd.h" +#include "arrow/datum.h" +#include "arrow/type_fwd.h" +#include "arrow/util/small_vector.h" + +namespace arrow { +namespace compute { + +/// \defgroup expression-core Expressions to describe data transformations +/// +/// @{ + +/// An unbound expression which maps a single Datum to another Datum. +/// An expression is one of +/// - A literal Datum. +/// - A reference to a single (potentially nested) field of the input Datum. +/// - A call to a compute function, with arguments specified by other Expressions. +class ARROW_EXPORT Expression { + public: + struct Call { + std::string function_name; + std::vector arguments; + std::shared_ptr options; + // Cached hash value + size_t hash; + + // post-Bind properties: + std::shared_ptr function; + const Kernel* kernel = NULLPTR; + std::shared_ptr kernel_state; + TypeHolder type; + + void ComputeHash(); + }; + + std::string ToString() const; + bool Equals(const Expression& other) const; + size_t hash() const; + struct Hash { + size_t operator()(const Expression& expr) const { return expr.hash(); } + }; + + /// Bind this expression to the given input type, looking up Kernels and field types. + /// Some expression simplification may be performed and implicit casts will be inserted. + /// Any state necessary for execution will be initialized and returned. + Result Bind(const TypeHolder& in, ExecContext* = NULLPTR) const; + Result Bind(const Schema& in_schema, ExecContext* = NULLPTR) const; + + // XXX someday + // Clone all KernelState in this bound expression. If any function referenced by this + // expression has mutable KernelState, it is not safe to execute or apply simplification + // passes to it (or copies of it!) from multiple threads. 
Cloning state produces new + // KernelStates where necessary to ensure that Expressions may be manipulated safely + // on multiple threads. + // Result CloneState() const; + // Status SetState(ExpressionState); + + /// Return true if all an expression's field references have explicit types + /// and all of its functions' kernels are looked up. + bool IsBound() const; + + /// Return true if this expression is composed only of Scalar literals, field + /// references, and calls to ScalarFunctions. + bool IsScalarExpression() const; + + /// Return true if this expression is literal and entirely null. + bool IsNullLiteral() const; + + /// Return true if this expression could evaluate to true. Will return true for any + /// unbound or non-boolean Expressions. IsSatisfiable does not (currently) do any + /// canonicalization or simplification of the expression, so even Expressions + /// which are unsatisfiable may spuriously return `true` here. This function is + /// intended for use in predicate pushdown where a filter expression is simplified + /// by a guarantee, so it assumes that trying to simplify again would be redundant. + bool IsSatisfiable() const; + + // XXX someday + // Result GetPipelines(); + + bool is_valid() const { return impl_ != NULLPTR; } + + /// Access a Call or return nullptr if this expression is not a call + const Call* call() const; + /// Access a Datum or return nullptr if this expression is not a literal + const Datum* literal() const; + /// Access a FieldRef or return nullptr if this expression is not a field_ref + const FieldRef* field_ref() const; + + /// The type to which this expression will evaluate + const DataType* type() const; + // XXX someday + // NullGeneralization::type nullable() const; + + struct Parameter { + FieldRef ref; + + // post-bind properties + TypeHolder type; + ::arrow::internal::SmallVector indices; + }; + const Parameter* parameter() const; + + Expression() = default; + explicit Expression(Call call); + explicit Expression(Datum literal); + explicit Expression(Parameter parameter); + + private: + using Impl = std::variant; + std::shared_ptr impl_; + + ARROW_FRIEND_EXPORT friend bool Identical(const Expression& l, const Expression& r); +}; + +inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); } +inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); } + +ARROW_EXPORT void PrintTo(const Expression&, std::ostream*); + +// Factories + +ARROW_EXPORT +Expression literal(Datum lit); + +template +Expression literal(Arg&& arg) { + return literal(Datum(std::forward(arg))); +} + +ARROW_EXPORT +Expression field_ref(FieldRef ref); + +ARROW_EXPORT +Expression call(std::string function, std::vector arguments, + std::shared_ptr options = NULLPTR); + +template ::value>::type> +Expression call(std::string function, std::vector arguments, + Options options) { + return call(std::move(function), std::move(arguments), + std::make_shared(std::move(options))); +} + +/// Assemble a list of all fields referenced by an Expression at any depth. +ARROW_EXPORT +std::vector FieldsInExpression(const Expression&); + +/// Check if the expression references any fields. +ARROW_EXPORT +bool ExpressionHasFieldRefs(const Expression&); + +struct ARROW_EXPORT KnownFieldValues; + +/// Assemble a mapping from field references to known values. This derives known values +/// from "equal" and "is_null" Expressions referencing a field and a literal. 
+ARROW_EXPORT +Result ExtractKnownFieldValues( + const Expression& guaranteed_true_predicate); + +/// @} + +/// \defgroup expression-passes Functions for modification of Expressions +/// +/// @{ +/// +/// These transform bound expressions. Some transforms utilize a guarantee, which is +/// provided as an Expression which is guaranteed to evaluate to true. The +/// guaranteed_true_predicate need not be bound, but canonicalization is currently +/// deferred to producers of guarantees. For example in order to be recognized as a +/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS +/// and literal RHS. Flipping the arguments, "is_in" with a one-long value_set, ... or +/// other semantically identical Expressions will not be recognized. + +/// Weak canonicalization which establishes guarantees for subsequent passes. Even +/// equivalent Expressions may result in different canonicalized expressions. +/// TODO this could be a strong canonicalization +ARROW_EXPORT +Result Canonicalize(Expression, ExecContext* = NULLPTR); + +/// Simplify Expressions based on literal arguments (for example, add(null, x) will always +/// be null so replace the call with a null literal). Includes early evaluation of all +/// calls whose arguments are entirely literal. +ARROW_EXPORT +Result FoldConstants(Expression); + +/// Simplify Expressions by replacing with known values of the fields which it references. +ARROW_EXPORT +Result ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values, + Expression); + +/// Simplify an expression by replacing subexpressions based on a guarantee: +/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is +/// used to remove redundant function calls from a filter expression or to replace a +/// reference to a constant-value field with a literal. +ARROW_EXPORT +Result SimplifyWithGuarantee(Expression, + const Expression& guaranteed_true_predicate); + +/// Replace all named field refs (e.g. "x" or "x.y") with field paths (e.g. [0] or [1,3]) +/// +/// This isn't usually needed and does not offer any simplification by itself. However, +/// it can be useful to normalize an expression to paths to make it simpler to work with. +ARROW_EXPORT Result RemoveNamedRefs(Expression expression); + +/// @} + +// Execution + +/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a +/// RecordBatch which may have missing or incorrectly ordered columns. +/// Missing fields will be replaced with null scalars. +ARROW_EXPORT Result MakeExecBatch(const Schema& full_schema, + const Datum& partial, + Expression guarantee = literal(true)); + +/// Execute a scalar expression against the provided state and input ExecBatch. This +/// expression must be bound. 
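// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the upstream header): composing an
// unbound filter Expression with the call()/field_ref()/literal() factories
// declared above, binding it to a schema, and simplifying it under a
// guarantee. The field names "x"/"y" and the wrapper name are assumptions.
#include "arrow/api.h"
#include "arrow/compute/api.h"
#include "arrow/compute/expression.h"

arrow::Result<arrow::compute::Expression> SimplifiedFilter(
    const arrow::Schema& dataset_schema) {
  using arrow::compute::call;
  using arrow::compute::field_ref;
  using arrow::compute::literal;

  // x > 1 && y == 7, written with the generic call() factory.
  arrow::compute::Expression filter =
      call("and", {call("greater", {field_ref("x"), literal(1)}),
                   call("equal", {field_ref("y"), literal(7)})});

  // Bind so that field types are resolved and kernels are looked up.
  ARROW_ASSIGN_OR_RAISE(filter, filter.Bind(dataset_schema));

  // A producer guarantees y == 7 for this fragment, so the second conjunct
  // can be folded away; the guarantee itself need not be bound.
  arrow::compute::Expression guarantee =
      call("equal", {field_ref("y"), literal(7)});
  return arrow::compute::SimplifyWithGuarantee(filter, guarantee);
}
// ---------------------------------------------------------------------------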
+ARROW_EXPORT +Result ExecuteScalarExpression(const Expression&, const ExecBatch& input, + ExecContext* = NULLPTR); + +/// Convenience function for invoking against a RecordBatch +ARROW_EXPORT +Result ExecuteScalarExpression(const Expression&, const Schema& full_schema, + const Datum& partial_input, ExecContext* = NULLPTR); + +// Serialization + +ARROW_EXPORT +Result> Serialize(const Expression&); + +ARROW_EXPORT +Result Deserialize(std::shared_ptr); + +/// \defgroup expression-convenience Helpers for convenient expression creation +/// +/// @{ + +ARROW_EXPORT Expression project(std::vector values, + std::vector names); + +ARROW_EXPORT Expression equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression less(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression greater(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false); + +ARROW_EXPORT Expression is_valid(Expression lhs); + +ARROW_EXPORT Expression and_(Expression lhs, Expression rhs); +ARROW_EXPORT Expression and_(const std::vector&); +ARROW_EXPORT Expression or_(Expression lhs, Expression rhs); +ARROW_EXPORT Expression or_(const std::vector&); +ARROW_EXPORT Expression not_(Expression operand); + +/// @} + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/function.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/function.h new file mode 100644 index 0000000000000000000000000000000000000000..2b86f642166e2ccb8a49e3842d98120d59cb25e6 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/function.h @@ -0,0 +1,409 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle. + +#pragma once + +#include +#include +#include + +#include "arrow/compute/kernel.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/datum.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/compare.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +/// \addtogroup compute-functions +/// @{ + +/// \brief Contains the number of required arguments for the function. +/// +/// Naming conventions taken from https://en.wikipedia.org/wiki/Arity. 
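Before moving on to function.h, one more sketch against the expression.h declarations just above: the comparison/boolean convenience factories compose nested calls without spelling out call(...) by hand, and Serialize/Deserialize round-trip an expression through a Buffer. ValueOrDie() again replaces real error handling.

#include <iostream>
#include <memory>

#include "arrow/buffer.h"
#include "arrow/compute/expression.h"

int main() {
  namespace cp = arrow::compute;

  cp::Expression filter =
      cp::and_(cp::greater_equal(cp::field_ref("x"), cp::literal(0)),
               cp::not_equal(cp::field_ref("y"), cp::literal(7)));

  std::shared_ptr<arrow::Buffer> buf = cp::Serialize(filter).ValueOrDie();
  cp::Expression roundtripped = cp::Deserialize(buf).ValueOrDie();

  // operator== is defined earlier in this header in terms of Expression::Equals.
  std::cout << (filter == roundtripped) << "\n";  // 1
  return 0;
}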
+struct ARROW_EXPORT Arity { + /// \brief A function taking no arguments + static Arity Nullary() { return Arity(0, false); } + + /// \brief A function taking 1 argument + static Arity Unary() { return Arity(1, false); } + + /// \brief A function taking 2 arguments + static Arity Binary() { return Arity(2, false); } + + /// \brief A function taking 3 arguments + static Arity Ternary() { return Arity(3, false); } + + /// \brief A function taking a variable number of arguments + /// + /// \param[in] min_args the minimum number of arguments required when + /// invoking the function + static Arity VarArgs(int min_args = 0) { return Arity(min_args, true); } + + // NOTE: the 0-argument form (default constructor) is required for Cython + explicit Arity(int num_args = 0, bool is_varargs = false) + : num_args(num_args), is_varargs(is_varargs) {} + + /// The number of required arguments (or the minimum number for varargs + /// functions). + int num_args; + + /// If true, then the num_args is the minimum number of required arguments. + bool is_varargs = false; +}; + +struct ARROW_EXPORT FunctionDoc { + /// \brief A one-line summary of the function, using a verb. + /// + /// For example, "Add two numeric arrays or scalars". + std::string summary; + + /// \brief A detailed description of the function, meant to follow the summary. + std::string description; + + /// \brief Symbolic names (identifiers) for the function arguments. + /// + /// Some bindings may use this to generate nicer function signatures. + std::vector arg_names; + + // TODO add argument descriptions? + + /// \brief Name of the options class, if any. + std::string options_class; + + /// \brief Whether options are required for function execution + /// + /// If false, then either the function does not have an options class + /// or there is a usable default options value. + bool options_required; + + FunctionDoc() = default; + + FunctionDoc(std::string summary, std::string description, + std::vector arg_names, std::string options_class = "", + bool options_required = false) + : summary(std::move(summary)), + description(std::move(description)), + arg_names(std::move(arg_names)), + options_class(std::move(options_class)), + options_required(options_required) {} + + static const FunctionDoc& Empty(); +}; + +/// \brief An executor of a function with a preconfigured kernel +class ARROW_EXPORT FunctionExecutor { + public: + virtual ~FunctionExecutor() = default; + /// \brief Initialize or re-initialize the preconfigured kernel + /// + /// This method may be called zero or more times. Depending on how + /// the FunctionExecutor was obtained, it may already have been initialized. + virtual Status Init(const FunctionOptions* options = NULLPTR, + ExecContext* exec_ctx = NULLPTR) = 0; + /// \brief Execute the preconfigured kernel with arguments that must fit it + /// + /// The method requires the arguments be castable to the preconfigured types. + /// + /// \param[in] args Arguments to execute the function on + /// \param[in] length Length of arguments batch or -1 to default it. If the + /// function has no parameters, this determines the batch length, defaulting + /// to 0. Otherwise, if the function is scalar, this must equal the argument + /// batch's inferred length or be -1 to default to it. This is ignored for + /// vector functions. + virtual Result Execute(const std::vector& args, int64_t length = -1) = 0; +}; + +/// \brief Base class for compute functions. 
Function implementations contain a +/// collection of "kernels" which are implementations of the function for +/// specific argument types. Selecting a viable kernel for executing a function +/// is referred to as "dispatching". +class ARROW_EXPORT Function { + public: + /// \brief The kind of function, which indicates in what contexts it is + /// valid for use. + enum Kind { + /// A function that performs scalar data operations on whole arrays of + /// data. Can generally process Array or Scalar values. The size of the + /// output will be the same as the size (or broadcasted size, in the case + /// of mixing Array and Scalar inputs) of the input. + SCALAR, + + /// A function with array input and output whose behavior depends on the + /// values of the entire arrays passed, rather than the value of each scalar + /// value. + VECTOR, + + /// A function that computes scalar summary statistics from array input. + SCALAR_AGGREGATE, + + /// A function that computes grouped summary statistics from array input + /// and an array of group identifiers. + HASH_AGGREGATE, + + /// A function that dispatches to other functions and does not contain its + /// own kernels. + META + }; + + virtual ~Function() = default; + + /// \brief The name of the kernel. The registry enforces uniqueness of names. + const std::string& name() const { return name_; } + + /// \brief The kind of kernel, which indicates in what contexts it is valid + /// for use. + Function::Kind kind() const { return kind_; } + + /// \brief Contains the number of arguments the function requires, or if the + /// function accepts variable numbers of arguments. + const Arity& arity() const { return arity_; } + + /// \brief Return the function documentation + const FunctionDoc& doc() const { return doc_; } + + /// \brief Returns the number of registered kernels for this function. + virtual int num_kernels() const = 0; + + /// \brief Return a kernel that can execute the function given the exact + /// argument types (without implicit type casts). + /// + /// NB: This function is overridden in CastFunction. + virtual Result DispatchExact(const std::vector& types) const; + + /// \brief Return a best-match kernel that can execute the function given the argument + /// types, after implicit casts are applied. + /// + /// \param[in,out] values Argument types. An element may be modified to + /// indicate that the returned kernel only approximately matches the input + /// value descriptors; callers are responsible for casting inputs to the type + /// required by the kernel. + virtual Result DispatchBest(std::vector* values) const; + + /// \brief Get a function executor with a best-matching kernel + /// + /// The returned executor will by default work with the default FunctionOptions + /// and KernelContext. If you want to change that, call `FunctionExecutor::Init`. + virtual Result> GetBestExecutor( + std::vector inputs) const; + + /// \brief Execute the function eagerly with the passed input arguments with + /// kernel dispatch, batch iteration, and memory allocation details taken + /// care of. + /// + /// If the `options` pointer is null, then `default_options()` will be used. + /// + /// This function can be overridden in subclasses. + virtual Result Execute(const std::vector& args, + const FunctionOptions* options, ExecContext* ctx) const; + + virtual Result Execute(const ExecBatch& batch, const FunctionOptions* options, + ExecContext* ctx) const; + + /// \brief Returns the default options for this function. 
+ /// + /// Whatever option semantics a Function has, implementations must guarantee + /// that default_options() is valid to pass to Execute as options. + const FunctionOptions* default_options() const { return default_options_; } + + virtual Status Validate() const; + + /// \brief Returns the pure property for this function. + /// + /// Impure functions are those that may return different results for the same + /// input arguments. For example, a function that returns a random number is + /// not pure. An expression containing only pure functions can be simplified by + /// pre-evaluating any sub-expressions that have constant arguments. + virtual bool is_pure() const { return true; } + + protected: + Function(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc, + const FunctionOptions* default_options) + : name_(std::move(name)), + kind_(kind), + arity_(arity), + doc_(std::move(doc)), + default_options_(default_options) {} + + Status CheckArity(size_t num_args) const; + + std::string name_; + Function::Kind kind_; + Arity arity_; + const FunctionDoc doc_; + const FunctionOptions* default_options_ = NULLPTR; +}; + +namespace detail { + +template +class FunctionImpl : public Function { + public: + /// \brief Return pointers to current-available kernels for inspection + std::vector kernels() const { + std::vector result; + for (const auto& kernel : kernels_) { + result.push_back(&kernel); + } + return result; + } + + int num_kernels() const override { return static_cast(kernels_.size()); } + + protected: + FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc, + const FunctionOptions* default_options) + : Function(std::move(name), kind, arity, std::move(doc), default_options) {} + + std::vector kernels_; +}; + +/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned. +ARROW_EXPORT +const Kernel* DispatchExactImpl(const Function* func, const std::vector&); + +/// \brief Return an error message if no Kernel is found. +ARROW_EXPORT +Status NoMatchingKernel(const Function* func, const std::vector&); + +} // namespace detail + +/// \brief A function that executes elementwise operations on arrays or +/// scalars, and therefore whose results generally do not depend on the order +/// of the values in the arguments. Accepts and returns arrays that are all of +/// the same size. These functions roughly correspond to the functions used in +/// SQL expressions. +class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { + public: + using KernelType = ScalarKernel; + + ScalarFunction(std::string name, const Arity& arity, FunctionDoc doc, + const FunctionOptions* default_options = NULLPTR, bool is_pure = true) + : detail::FunctionImpl(std::move(name), Function::SCALAR, arity, + std::move(doc), default_options), + is_pure_(is_pure) {} + + /// \brief Add a kernel with given input/output types, no required state + /// initialization, preallocation for fixed-width types, and default null + /// handling (intersect validity bitmaps of inputs). + Status AddKernel(std::vector in_types, OutputType out_type, + ArrayKernelExec exec, KernelInit init = NULLPTR); + + /// \brief Add a kernel (function implementation). Returns error if the + /// kernel's signature does not match the function's arity. + Status AddKernel(ScalarKernel kernel); + + /// \brief Returns the pure property for this function. 
+ bool is_pure() const override { return is_pure_; } + + private: + const bool is_pure_; +}; + +/// \brief A function that executes general array operations that may yield +/// outputs of different sizes or have results that depend on the whole array +/// contents. These functions roughly correspond to the functions found in +/// non-SQL array languages like APL and its derivatives. +class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { + public: + using KernelType = VectorKernel; + + VectorFunction(std::string name, const Arity& arity, FunctionDoc doc, + const FunctionOptions* default_options = NULLPTR) + : detail::FunctionImpl(std::move(name), Function::VECTOR, arity, + std::move(doc), default_options) {} + + /// \brief Add a simple kernel with given input/output types, no required + /// state initialization, no data preallocation, and no preallocation of the + /// validity bitmap. + Status AddKernel(std::vector in_types, OutputType out_type, + ArrayKernelExec exec, KernelInit init = NULLPTR); + + /// \brief Add a kernel (function implementation). Returns error if the + /// kernel's signature does not match the function's arity. + Status AddKernel(VectorKernel kernel); +}; + +class ARROW_EXPORT ScalarAggregateFunction + : public detail::FunctionImpl { + public: + using KernelType = ScalarAggregateKernel; + + ScalarAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc, + const FunctionOptions* default_options = NULLPTR) + : detail::FunctionImpl(std::move(name), + Function::SCALAR_AGGREGATE, arity, + std::move(doc), default_options) {} + + /// \brief Add a kernel (function implementation). Returns error if the + /// kernel's signature does not match the function's arity. + Status AddKernel(ScalarAggregateKernel kernel); +}; + +class ARROW_EXPORT HashAggregateFunction + : public detail::FunctionImpl { + public: + using KernelType = HashAggregateKernel; + + HashAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc, + const FunctionOptions* default_options = NULLPTR) + : detail::FunctionImpl(std::move(name), + Function::HASH_AGGREGATE, arity, + std::move(doc), default_options) {} + + /// \brief Add a kernel (function implementation). Returns error if the + /// kernel's signature does not match the function's arity. + Status AddKernel(HashAggregateKernel kernel); +}; + +/// \brief A function that dispatches to other functions. Must implement +/// MetaFunction::ExecuteImpl. +/// +/// For Array, ChunkedArray, and Scalar Datum kinds, may rely on the execution +/// of concrete Function types, but must handle other Datum kinds on its own. 
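A sketch of driving a Function directly: look one up, inspect its Arity, FunctionDoc and kernel count, then Execute it eagerly. GetFunctionRegistry() is the process-global registry accessor from registry.h (that header appears further below in this diff; the free function itself is an assumption here), and ValueOrDie() collapses error handling.

#include <cstdint>
#include <iostream>
#include <memory>

#include "arrow/api.h"
#include "arrow/compute/function.h"
#include "arrow/compute/registry.h"

int main() {
  namespace cp = arrow::compute;

  std::shared_ptr<cp::Function> add =
      cp::GetFunctionRegistry()->GetFunction("add").ValueOrDie();

  std::cout << add->name() << ": kind=" << add->kind()
            << ", kernels=" << add->num_kernels()
            << ", arity=" << add->arity().num_args << "\n";
  std::cout << add->doc().summary << "\n";

  // Eager execution: dispatch, batch iteration and allocation are handled
  // internally; a null ExecContext falls back to the default context.
  arrow::Datum result =
      add->Execute({arrow::Datum(int64_t(2)), arrow::Datum(int64_t(40))},
                   add->default_options(), /*ctx=*/nullptr)
          .ValueOrDie();
  std::cout << result.scalar()->ToString() << "\n";  // 42
  return 0;
}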
+class ARROW_EXPORT MetaFunction : public Function { + public: + int num_kernels() const override { return 0; } + + Result Execute(const std::vector& args, const FunctionOptions* options, + ExecContext* ctx) const override; + + Result Execute(const ExecBatch& batch, const FunctionOptions* options, + ExecContext* ctx) const override; + + protected: + virtual Result ExecuteImpl(const std::vector& args, + const FunctionOptions* options, + ExecContext* ctx) const = 0; + + MetaFunction(std::string name, const Arity& arity, FunctionDoc doc, + const FunctionOptions* default_options = NULLPTR) + : Function(std::move(name), Function::META, arity, std::move(doc), + default_options) {} +}; + +/// @} + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/function_options.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/function_options.h new file mode 100644 index 0000000000000000000000000000000000000000..88ec2fd2d0679b5c849549179aa652bec9b37b56 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/function_options.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle. + +#pragma once + +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +/// \addtogroup compute-functions +/// @{ + +/// \brief Extension point for defining options outside libarrow (but +/// still within this project). +class ARROW_EXPORT FunctionOptionsType { + public: + virtual ~FunctionOptionsType() = default; + + virtual const char* type_name() const = 0; + virtual std::string Stringify(const FunctionOptions&) const = 0; + virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; + virtual Result> Serialize(const FunctionOptions&) const; + virtual Result> Deserialize( + const Buffer& buffer) const; + virtual std::unique_ptr Copy(const FunctionOptions&) const = 0; +}; + +/// \brief Base class for specifying options configuring a function's behavior, +/// such as error handling. +class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { + public: + virtual ~FunctionOptions() = default; + + const FunctionOptionsType* options_type() const { return options_type_; } + const char* type_name() const { return options_type()->type_name(); } + + bool Equals(const FunctionOptions& other) const; + std::string ToString() const; + std::unique_ptr Copy() const; + /// \brief Serialize an options struct to a buffer. 
+ Result> Serialize() const; + /// \brief Deserialize an options struct from a buffer. + /// Note: this will only look for `type_name` in the default FunctionRegistry; + /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then + /// call FunctionOptionsType::Deserialize(). + static Result> Deserialize( + const std::string& type_name, const Buffer& buffer); + + protected: + explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} + const FunctionOptionsType* options_type_; +}; + +ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); + +/// @} + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/kernel.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..cfb6265f12904bf3c7c16f272f942ead1765b444 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/kernel.h @@ -0,0 +1,753 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/compute/exec.h" +#include "arrow/datum.h" +#include "arrow/device_allocation_type_set.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +// macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h. +// No other BSD seems to do so. The name is used as an identifier in MemAllocation enum. +#if defined(__APPLE__) && defined(PREALLOCATE) +# undef PREALLOCATE +#endif + +namespace arrow { +namespace compute { + +class FunctionOptions; + +/// \brief Base class for opaque kernel-specific state. For example, if there +/// is some kind of initialization required. +struct ARROW_EXPORT KernelState { + virtual ~KernelState() = default; +}; + +/// \brief Context/state for the execution of a particular kernel. +class ARROW_EXPORT KernelContext { + public: + // Can pass optional backreference; not used consistently for the + // moment but will be made so in the future + explicit KernelContext(ExecContext* exec_ctx, const Kernel* kernel = NULLPTR) + : exec_ctx_(exec_ctx), kernel_(kernel) {} + + /// \brief Allocate buffer from the context's memory pool. The contents are + /// not initialized. + Result> Allocate(int64_t nbytes); + + /// \brief Allocate buffer for bitmap from the context's memory pool. 
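The generic FunctionOptions surface above (type_name, ToString, Equals, Copy) can be exercised through whatever default options a built-in function carries; which built-ins have non-null defaults is an assumption here, so the sketch guards for null. GetFunctionRegistry() is again the global accessor from registry.h.

#include <iostream>

#include "arrow/compute/api.h"

int main() {
  namespace cp = arrow::compute;

  auto fn = cp::GetFunctionRegistry()->GetFunction("count").ValueOrDie();
  const cp::FunctionOptions* opts = fn->default_options();
  if (opts != nullptr) {
    std::cout << opts->type_name() << ": " << opts->ToString() << "\n";
    std::cout << "copy compares equal: " << opts->Copy()->Equals(*opts) << "\n";
  }
  return 0;
}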
Like + /// Allocate, the contents of the buffer are not initialized but the last + /// byte is preemptively zeroed to help avoid ASAN or valgrind issues. + Result> AllocateBitmap(int64_t num_bits); + + /// \brief Assign the active KernelState to be utilized for each stage of + /// kernel execution. Ownership and memory lifetime of the KernelState must + /// be minded separately. + void SetState(KernelState* state) { state_ = state; } + + // Set kernel that is being invoked since some kernel + // implementations will examine the kernel state. + void SetKernel(const Kernel* kernel) { kernel_ = kernel; } + + KernelState* state() { return state_; } + + /// \brief Configuration related to function execution that is to be shared + /// across multiple kernels. + ExecContext* exec_context() { return exec_ctx_; } + + /// \brief The memory pool to use for allocations. For now, it uses the + /// MemoryPool contained in the ExecContext used to create the KernelContext. + MemoryPool* memory_pool() { return exec_ctx_->memory_pool(); } + + const Kernel* kernel() const { return kernel_; } + + private: + ExecContext* exec_ctx_; + KernelState* state_ = NULLPTR; + const Kernel* kernel_ = NULLPTR; +}; + +/// \brief An type-checking interface to permit customizable validation rules +/// for use with InputType and KernelSignature. This is for scenarios where the +/// acceptance is not an exact type instance, such as a TIMESTAMP type for a +/// specific TimeUnit, but permitting any time zone. +struct ARROW_EXPORT TypeMatcher { + virtual ~TypeMatcher() = default; + + /// \brief Return true if this matcher accepts the data type. + virtual bool Matches(const DataType& type) const = 0; + + /// \brief A human-interpretable string representation of what the type + /// matcher checks for, usable when printing KernelSignature or formatting + /// error messages. + virtual std::string ToString() const = 0; + + /// \brief Return true if this TypeMatcher contains the same matching rule as + /// the other. Currently depends on RTTI. + virtual bool Equals(const TypeMatcher& other) const = 0; +}; + +namespace match { + +/// \brief Match any DataType instance having the same DataType::id. +ARROW_EXPORT std::shared_ptr SameTypeId(Type::type type_id); + +/// \brief Match any TimestampType instance having the same unit, but the time +/// zones can be different. 
+ARROW_EXPORT std::shared_ptr TimestampTypeUnit(TimeUnit::type unit); +ARROW_EXPORT std::shared_ptr Time32TypeUnit(TimeUnit::type unit); +ARROW_EXPORT std::shared_ptr Time64TypeUnit(TimeUnit::type unit); +ARROW_EXPORT std::shared_ptr DurationTypeUnit(TimeUnit::type unit); + +// \brief Match any integer type +ARROW_EXPORT std::shared_ptr Integer(); + +// Match types using 32-bit varbinary representation +ARROW_EXPORT std::shared_ptr BinaryLike(); + +// Match types using 64-bit varbinary representation +ARROW_EXPORT std::shared_ptr LargeBinaryLike(); + +// Match any fixed binary type +ARROW_EXPORT std::shared_ptr FixedSizeBinaryLike(); + +// \brief Match any primitive type (boolean or any type representable as a C +// Type) +ARROW_EXPORT std::shared_ptr Primitive(); + +// \brief Match any integer type that can be used as run-end in run-end encoded +// arrays +ARROW_EXPORT std::shared_ptr RunEndInteger(); + +/// \brief Match run-end encoded types that use any valid run-end type and +/// encode specific value types +/// +/// @param[in] value_type_matcher a matcher that is applied to the values field +ARROW_EXPORT std::shared_ptr RunEndEncoded( + std::shared_ptr value_type_matcher); + +/// \brief Match run-end encoded types that use any valid run-end type and +/// encode specific value types +/// +/// @param[in] value_type_id a type id that the type of the values field should match +ARROW_EXPORT std::shared_ptr RunEndEncoded(Type::type value_type_id); + +/// \brief Match run-end encoded types that encode specific run-end and value types +/// +/// @param[in] run_end_type_matcher a matcher that is applied to the run_ends field +/// @param[in] value_type_matcher a matcher that is applied to the values field +ARROW_EXPORT std::shared_ptr RunEndEncoded( + std::shared_ptr run_end_type_matcher, + std::shared_ptr value_type_matcher); + +} // namespace match + +/// \brief An object used for type-checking arguments to be passed to a kernel +/// and stored in a KernelSignature. The type-checking rule can be supplied +/// either with an exact DataType instance or a custom TypeMatcher. +class ARROW_EXPORT InputType { + public: + /// \brief The kind of type-checking rule that the InputType contains. + enum Kind { + /// \brief Accept any value type. + ANY_TYPE, + + /// \brief A fixed arrow::DataType and will only exact match having this + /// exact type (e.g. same TimestampType unit, same decimal scale and + /// precision, or same nested child types). + EXACT_TYPE, + + /// \brief Uses a TypeMatcher implementation to check the type. + USE_TYPE_MATCHER + }; + + /// \brief Accept any value type + InputType() : kind_(ANY_TYPE) {} + + /// \brief Accept an exact value type. + InputType(std::shared_ptr type) // NOLINT implicit construction + : kind_(EXACT_TYPE), type_(std::move(type)) {} + + /// \brief Use the passed TypeMatcher to type check. + InputType(std::shared_ptr type_matcher) // NOLINT implicit construction + : kind_(USE_TYPE_MATCHER), type_matcher_(std::move(type_matcher)) {} + + /// \brief Match any type with the given Type::type. Uses a TypeMatcher for + /// its implementation. 
+ InputType(Type::type type_id) // NOLINT implicit construction + : InputType(match::SameTypeId(type_id)) {} + + InputType(const InputType& other) { CopyInto(other); } + + void operator=(const InputType& other) { CopyInto(other); } + + InputType(InputType&& other) { MoveInto(std::forward(other)); } + + void operator=(InputType&& other) { MoveInto(std::forward(other)); } + + // \brief Match any input (array, scalar of any type) + static InputType Any() { return InputType(); } + + /// \brief Return true if this input type matches the same type cases as the + /// other. + bool Equals(const InputType& other) const; + + bool operator==(const InputType& other) const { return this->Equals(other); } + + bool operator!=(const InputType& other) const { return !(*this == other); } + + /// \brief Return hash code. + size_t Hash() const; + + /// \brief Render a human-readable string representation. + std::string ToString() const; + + /// \brief Return true if the Datum matches this argument kind in + /// type (and only allows scalar or array-like Datums). + bool Matches(const Datum& value) const; + + /// \brief Return true if the type matches this InputType + bool Matches(const DataType& type) const; + + /// \brief The type matching rule that this InputType uses. + Kind kind() const { return kind_; } + + /// \brief For InputType::EXACT_TYPE kind, the exact type that this InputType + /// must match. Otherwise this function should not be used and will assert in + /// debug builds. + const std::shared_ptr& type() const; + + /// \brief For InputType::USE_TYPE_MATCHER, the TypeMatcher to be used for + /// checking the type of a value. Otherwise this function should not be used + /// and will assert in debug builds. + const TypeMatcher& type_matcher() const; + + private: + void CopyInto(const InputType& other) { + this->kind_ = other.kind_; + this->type_ = other.type_; + this->type_matcher_ = other.type_matcher_; + } + + void MoveInto(InputType&& other) { + this->kind_ = other.kind_; + this->type_ = std::move(other.type_); + this->type_matcher_ = std::move(other.type_matcher_); + } + + Kind kind_; + + // For EXACT_TYPE Kind + std::shared_ptr type_; + + // For USE_TYPE_MATCHER Kind + std::shared_ptr type_matcher_; +}; + +/// \brief Container to capture both exact and input-dependent output types. +class ARROW_EXPORT OutputType { + public: + /// \brief An enum indicating whether the value type is an invariant fixed + /// value or one that's computed by a kernel-defined resolver function. + enum ResolveKind { FIXED, COMPUTED }; + + /// Type resolution function. Given input types, return output type. This + /// function MAY may use the kernel state to decide the output type based on + /// the FunctionOptions. + /// + /// This function SHOULD _not_ be used to check for arity, that is to be + /// performed one or more layers above. 
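A sketch of the matchers and InputType rules above, as they would be used when declaring kernel signatures; the type factories (int32(), utf8(), timestamp()) are the usual ones from arrow/type.h.

#include <iostream>

#include "arrow/api.h"
#include "arrow/compute/kernel.h"

int main() {
  namespace cp = arrow::compute;

  // Accept any timestamp with millisecond unit, regardless of time zone.
  cp::InputType ts_ms(cp::match::TimestampTypeUnit(arrow::TimeUnit::MILLI));
  std::cout << ts_ms.Matches(*arrow::timestamp(arrow::TimeUnit::MILLI, "UTC"))
            << "\n";  // 1

  // Exact-type rule vs. type-id rule.
  cp::InputType exact(arrow::int32());
  cp::InputType by_id(arrow::Type::STRING);
  std::cout << exact.Matches(*arrow::int32()) << " "
            << by_id.Matches(*arrow::utf8()) << "\n";  // 1 1
  return 0;
}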
+ using Resolver = + std::function(KernelContext*, const std::vector&)>; + + /// \brief Output an exact type + OutputType(std::shared_ptr type) // NOLINT implicit construction + : kind_(FIXED), type_(std::move(type)) {} + + /// \brief Output a computed type depending on actual input types + template + OutputType(Fn resolver) // NOLINT implicit construction + : kind_(COMPUTED), resolver_(std::move(resolver)) {} + + OutputType(const OutputType& other) { + this->kind_ = other.kind_; + this->type_ = other.type_; + this->resolver_ = other.resolver_; + } + + OutputType(OutputType&& other) { + this->kind_ = other.kind_; + this->type_ = std::move(other.type_); + this->resolver_ = other.resolver_; + } + + OutputType& operator=(const OutputType&) = default; + OutputType& operator=(OutputType&&) = default; + + /// \brief Return the type of the expected output value of the kernel given + /// the input argument types. The resolver may make use of state information + /// kept in the KernelContext. + Result Resolve(KernelContext* ctx, + const std::vector& args) const; + + /// \brief The exact output value type for the FIXED kind. + const std::shared_ptr& type() const; + + /// \brief For use with COMPUTED resolution strategy. It may be more + /// convenient to invoke this with OutputType::Resolve returned from this + /// method. + const Resolver& resolver() const; + + /// \brief Render a human-readable string representation. + std::string ToString() const; + + /// \brief Return the kind of type resolution of this output type, whether + /// fixed/invariant or computed by a resolver. + ResolveKind kind() const { return kind_; } + + private: + ResolveKind kind_; + + // For FIXED resolution + std::shared_ptr type_; + + // For COMPUTED resolution + Resolver resolver_ = NULLPTR; +}; + +/// \brief Holds the input types and output type of the kernel. +/// +/// VarArgs functions with minimum N arguments should pass up to N input types to be +/// used to validate the input types of a function invocation. The first N-1 types +/// will be matched against the first N-1 arguments, and the last type will be +/// matched against the remaining arguments. +class ARROW_EXPORT KernelSignature { + public: + KernelSignature(std::vector in_types, OutputType out_type, + bool is_varargs = false); + + /// \brief Convenience ctor since make_shared can be awkward + static std::shared_ptr Make(std::vector in_types, + OutputType out_type, + bool is_varargs = false); + + /// \brief Return true if the signature if compatible with the list of input + /// value descriptors. + bool MatchesInputs(const std::vector& types) const; + + /// \brief Returns true if the input types of each signature are + /// equal. Well-formed functions should have a deterministic output type + /// given input types, but currently it is the responsibility of the + /// developer to ensure this. + bool Equals(const KernelSignature& other) const; + + bool operator==(const KernelSignature& other) const { return this->Equals(other); } + + bool operator!=(const KernelSignature& other) const { return !(*this == other); } + + /// \brief Compute a hash code for the signature + size_t Hash() const; + + /// \brief The input types for the kernel. For VarArgs functions, this should + /// generally contain a single validator to use for validating all of the + /// function arguments. + const std::vector& in_types() const { return in_types_; } + + /// \brief The output type for the kernel. 
Use Resolve to return the + /// exact output given input argument types, since many kernels' + /// output types depend on their input types (or their type + /// metadata). + const OutputType& out_type() const { return out_type_; } + + /// \brief Render a human-readable string representation + std::string ToString() const; + + bool is_varargs() const { return is_varargs_; } + + private: + std::vector in_types_; + OutputType out_type_; + bool is_varargs_; + + // For caching the hash code after it's computed the first time + mutable uint64_t hash_code_; +}; + +/// \brief A function may contain multiple variants of a kernel for a given +/// type combination for different SIMD levels. Based on the active system's +/// CPU info or the user's preferences, we can elect to use one over the other. +struct SimdLevel { + enum type { NONE = 0, SSE4_2, AVX, AVX2, AVX512, NEON, MAX }; +}; + +/// \brief The strategy to use for propagating or otherwise populating the +/// validity bitmap of a kernel output. +struct NullHandling { + enum type { + /// Compute the output validity bitmap by intersecting the validity bitmaps + /// of the arguments using bitwise-and operations. This means that values + /// in the output are valid/non-null only if the corresponding values in + /// all input arguments were valid/non-null. Kernel generally need not + /// touch the bitmap thereafter, but a kernel's exec function is permitted + /// to alter the bitmap after the null intersection is computed if it needs + /// to. + INTERSECTION, + + /// Kernel expects a pre-allocated buffer to write the result bitmap + /// into. The preallocated memory is not zeroed (except for the last byte), + /// so the kernel should ensure to completely populate the bitmap. + COMPUTED_PREALLOCATE, + + /// Kernel allocates and sets the validity bitmap of the output. + COMPUTED_NO_PREALLOCATE, + + /// Kernel output is never null and a validity bitmap does not need to be + /// allocated. + OUTPUT_NOT_NULL + }; +}; + +/// \brief The preference for memory preallocation of fixed-width type outputs +/// in kernel execution. +struct MemAllocation { + enum type { + // For data types that support pre-allocation (i.e. fixed-width), the + // kernel expects to be provided a pre-allocated data buffer to write + // into. Non-fixed-width types must always allocate their own data + // buffers. The allocation made for the same length as the execution batch, + // so vector kernels yielding differently sized output should not use this. + // + // It is valid for the data to not be preallocated but the validity bitmap + // is (or is computed using the intersection/bitwise-and method). + // + // For variable-size output types like BinaryType or StringType, or for + // nested types, this option has no effect. + PREALLOCATE, + + // The kernel is responsible for allocating its own data buffer for + // fixed-width type outputs. + NO_PREALLOCATE + }; +}; + +struct Kernel; + +/// \brief Arguments to pass to an KernelInit function. A struct is used to help +/// avoid API breakage should the arguments passed need to be expanded. +struct KernelInitArgs { + /// \brief A pointer to the kernel being initialized. The init function may + /// depend on the kernel's KernelSignature or other data contained there. + const Kernel* kernel; + + /// \brief The types of the input arguments that the kernel is + /// about to be executed against. + const std::vector& inputs; + + /// \brief Opaque options specific to this kernel. May be nullptr for functions + /// that do not require options. 
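A sketch of a KernelSignature whose output type is computed by a resolver ("first input type wins", as an add-like kernel might declare), then checked against candidate input types. TypeHolder's implicit construction from std::shared_ptr<DataType> is assumed.

#include <iostream>
#include <memory>
#include <vector>

#include "arrow/api.h"
#include "arrow/compute/kernel.h"

int main() {
  namespace cp = arrow::compute;

  // COMPUTED output type: resolve to the type of the first argument.
  cp::OutputType first_input_type(
      [](cp::KernelContext*, const std::vector<arrow::TypeHolder>& in)
          -> arrow::Result<arrow::TypeHolder> { return in.front(); });

  auto sig = cp::KernelSignature::Make(
      {cp::InputType(arrow::Type::INT32), cp::InputType(arrow::Type::INT32)},
      first_input_type);

  std::vector<arrow::TypeHolder> args = {arrow::int32(), arrow::int32()};
  std::cout << sig->MatchesInputs(args) << "\n";  // 1
  std::cout << sig->ToString() << "\n";
  return 0;
}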
+ const FunctionOptions* options; +}; + +/// \brief Common initializer function for all kernel types. +using KernelInit = std::function>( + KernelContext*, const KernelInitArgs&)>; + +/// \brief Base type for kernels. Contains the function signature and +/// optionally the state initialization function, along with some common +/// attributes +struct ARROW_EXPORT Kernel { + Kernel() = default; + + Kernel(std::shared_ptr sig, KernelInit init) + : signature(std::move(sig)), init(std::move(init)) {} + + Kernel(std::vector in_types, OutputType out_type, KernelInit init) + : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init)) {} + + /// \brief The "signature" of the kernel containing the InputType input + /// argument validators and OutputType output type resolver. + std::shared_ptr signature; + + /// \brief Create a new KernelState for invocations of this kernel, e.g. to + /// set up any options or state relevant for execution. + KernelInit init; + + /// \brief Create a vector of new KernelState for invocations of this kernel. + static Status InitAll(KernelContext*, const KernelInitArgs&, + std::vector>*); + + /// \brief Indicates whether execution can benefit from parallelization + /// (splitting large chunks into smaller chunks and using multiple + /// threads). Some kernels may not support parallel execution at + /// all. Synchronization and concurrency-related issues are currently the + /// responsibility of the Kernel's implementation. + bool parallelizable = true; + + /// \brief Indicates the level of SIMD instruction support in the host CPU is + /// required to use the function. The intention is for functions to be able to + /// contain multiple kernels with the same signature but different levels of SIMD, + /// so that the most optimized kernel supported on a host's processor can be chosen. + SimdLevel::type simd_level = SimdLevel::NONE; + + // Additional kernel-specific data + std::shared_ptr data; +}; + +/// \brief The scalar kernel execution API that must be implemented for SCALAR +/// kernel types. This includes both stateless and stateful kernels. Kernels +/// depending on some execution state access that state via subclasses of +/// KernelState set on the KernelContext object. Implementations should +/// endeavor to write into pre-allocated memory if they are able, though for +/// some kernels (e.g. in cases when a builder like StringBuilder) must be +/// employed this may not be possible. +using ArrayKernelExec = Status (*)(KernelContext*, const ExecSpan&, ExecResult*); + +/// \brief Kernel data structure for implementations of ScalarFunction. In +/// addition to the members found in Kernel, contains the null handling +/// and memory pre-allocation preferences. +struct ARROW_EXPORT ScalarKernel : public Kernel { + ScalarKernel() = default; + + ScalarKernel(std::shared_ptr sig, ArrayKernelExec exec, + KernelInit init = NULLPTR) + : Kernel(std::move(sig), init), exec(exec) {} + + ScalarKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, + KernelInit init = NULLPTR) + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(exec) {} + + /// \brief Perform a single invocation of this kernel. Depending on the + /// implementation, it may only write into preallocated memory, while in some + /// cases it will allocate its own memory. Any required state is managed + /// through the KernelContext. 
+ ArrayKernelExec exec; + + /// \brief Writing execution results into larger contiguous allocations + /// requires that the kernel be able to write into sliced output ArrayData*, + /// including sliced output validity bitmaps. Some kernel implementations may + /// not be able to do this, so setting this to false disables this + /// functionality. + bool can_write_into_slices = true; + + // For scalar functions preallocated data and intersecting arg validity + // bitmaps is a reasonable default + NullHandling::type null_handling = NullHandling::INTERSECTION; + MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE; +}; + +// ---------------------------------------------------------------------- +// VectorKernel (for VectorFunction) + +/// \brief Kernel data structure for implementations of VectorFunction. In +/// contains an optional finalizer function, the null handling and memory +/// pre-allocation preferences (which have different defaults from +/// ScalarKernel), and some other execution-related options. +struct ARROW_EXPORT VectorKernel : public Kernel { + /// \brief See VectorKernel::finalize member for usage + using FinalizeFunc = std::function*)>; + + /// \brief Function for executing a stateful VectorKernel against a + /// ChunkedArray input. Does not need to be defined for all VectorKernels + using ChunkedExec = Status (*)(KernelContext*, const ExecBatch&, Datum* out); + + VectorKernel() = default; + + VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, + KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + exec(exec), + finalize(std::move(finalize)) {} + + VectorKernel(std::shared_ptr sig, ArrayKernelExec exec, + KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) + : Kernel(std::move(sig), std::move(init)), + exec(exec), + finalize(std::move(finalize)) {} + + /// \brief Perform a single invocation of this kernel. Any required state is + /// managed through the KernelContext. + ArrayKernelExec exec; + + /// \brief Execute the kernel on a ChunkedArray. Does not need to be defined + ChunkedExec exec_chunked = NULLPTR; + + /// \brief For VectorKernel, convert intermediate results into finalized + /// results. Mutates input argument. Some kernels may accumulate state + /// (example: hashing-related functions) through processing chunked inputs, and + /// then need to attach some accumulated state to each of the outputs of + /// processing each chunk of data. + FinalizeFunc finalize; + + /// Since vector kernels generally are implemented rather differently from + /// scalar/elementwise kernels (and they may not even yield arrays of the same + /// size), so we make the developer opt-in to any memory preallocation rather + /// than having to turn it off. + NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE; + + /// \brief Writing execution results into larger contiguous allocations + /// requires that the kernel be able to write into sliced output ArrayData*, + /// including sliced output validity bitmaps. Some kernel implementations may + /// not be able to do this, so setting this to false disables this + /// functionality. + bool can_write_into_slices = true; + + /// Some vector kernels can do chunkwise execution using ExecSpanIterator, + /// in some cases accumulating some state. 
Other kernels (like Take) need to + /// be passed whole arrays and don't work on ChunkedArray inputs + bool can_execute_chunkwise = true; + + /// Some kernels (like unique and value_counts) yield non-chunked output from + /// chunked-array inputs. This option controls how the results are boxed when + /// returned from ExecVectorFunction + /// + /// true -> ChunkedArray + /// false -> Array + bool output_chunked = true; +}; + +// ---------------------------------------------------------------------- +// ScalarAggregateKernel (for ScalarAggregateFunction) + +using ScalarAggregateConsume = Status (*)(KernelContext*, const ExecSpan&); +using ScalarAggregateMerge = Status (*)(KernelContext*, KernelState&&, KernelState*); +// Finalize returns Datum to permit multiple return values +using ScalarAggregateFinalize = Status (*)(KernelContext*, Datum*); + +/// \brief Kernel data structure for implementations of +/// ScalarAggregateFunction. The four necessary components of an aggregation +/// kernel are the init, consume, merge, and finalize functions. +/// +/// * init: creates a new KernelState for a kernel. +/// * consume: processes an ExecSpan and updates the KernelState found in the +/// KernelContext. +/// * merge: combines one KernelState with another. +/// * finalize: produces the end result of the aggregation using the +/// KernelState in the KernelContext. +struct ARROW_EXPORT ScalarAggregateKernel : public Kernel { + ScalarAggregateKernel(std::shared_ptr sig, KernelInit init, + ScalarAggregateConsume consume, ScalarAggregateMerge merge, + ScalarAggregateFinalize finalize, const bool ordered) + : Kernel(std::move(sig), std::move(init)), + consume(consume), + merge(merge), + finalize(finalize), + ordered(ordered) {} + + ScalarAggregateKernel(std::vector in_types, OutputType out_type, + KernelInit init, ScalarAggregateConsume consume, + ScalarAggregateMerge merge, ScalarAggregateFinalize finalize, + const bool ordered) + : ScalarAggregateKernel( + KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init), consume, merge, finalize, ordered) {} + + /// \brief Merge a vector of KernelStates into a single KernelState. + /// The merged state will be returned and will be set on the KernelContext. + static Result> MergeAll( + const ScalarAggregateKernel* kernel, KernelContext* ctx, + std::vector> states); + + ScalarAggregateConsume consume; + ScalarAggregateMerge merge; + ScalarAggregateFinalize finalize; + /// \brief Whether this kernel requires ordering + /// Some aggregations, such as, "first", requires some kind of input order. The + /// order can be implicit, e.g., the order of the input data, or explicit, e.g. + /// the ordering specified with a window aggregation. + /// The caller of the aggregate kernel is responsible for passing data in some + /// defined order to the kernel. The flag here is a way for the kernel to tell + /// the caller that data passed to the kernel must be defined in some order. 
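Tying the pieces together, a minimal user-defined scalar function: an ArrayKernelExec, a ScalarFunction with a single int64 kernel, registration, and eager execution. It leans on the ScalarKernel defaults above (preallocated output, validity bitmaps intersected by the executor), assumes the usual ExecSpan/ArraySpan accessors from arrow/compute/exec.h and arrow/array/data.h, and assumes GetFunctionRegistry() from registry.h (shown further below); it is a sketch, not the canonical way kernels are written inside Arrow itself.

#include <cstdint>
#include <iostream>
#include <memory>

#include "arrow/api.h"
#include "arrow/compute/api.h"

namespace cp = arrow::compute;

// out[i] = in[i] + 1, written into the preallocated output span. Only int64
// array inputs are handled.
arrow::Status AddOneExec(cp::KernelContext*, const cp::ExecSpan& batch,
                         cp::ExecResult* out) {
  const int64_t* in = batch[0].array.GetValues<int64_t>(1);
  int64_t* out_values = out->array_span_mutable()->GetValues<int64_t>(1);
  for (int64_t i = 0; i < batch.length; ++i) {
    out_values[i] = in[i] + 1;
  }
  return arrow::Status::OK();
}

arrow::Status RunExample() {
  auto func = std::make_shared<cp::ScalarFunction>(
      "editor_add_one", cp::Arity::Unary(),
      cp::FunctionDoc("Add one to each element",
                      "Editor's demo function; int64 arrays only.", {"arg"}));
  ARROW_RETURN_NOT_OK(func->AddKernel({cp::InputType(arrow::int64())},
                                      arrow::int64(), AddOneExec));
  ARROW_RETURN_NOT_OK(cp::GetFunctionRegistry()->AddFunction(func));

  arrow::Int64Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, builder.Finish());

  ARROW_ASSIGN_OR_RAISE(arrow::Datum result,
                        func->Execute({arrow::Datum(arr)},
                                      func->default_options(), /*ctx=*/nullptr));
  std::cout << result.make_array()->ToString() << "\n";  // [2, 3, 4]
  return arrow::Status::OK();
}

int main() { return RunExample().ok() ? 0 : 1; }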
+ bool ordered = false; +}; + +// ---------------------------------------------------------------------- +// HashAggregateKernel (for HashAggregateFunction) + +using HashAggregateResize = Status (*)(KernelContext*, int64_t); +using HashAggregateConsume = Status (*)(KernelContext*, const ExecSpan&); +using HashAggregateMerge = Status (*)(KernelContext*, KernelState&&, const ArrayData&); + +// Finalize returns Datum to permit multiple return values +using HashAggregateFinalize = Status (*)(KernelContext*, Datum*); + +/// \brief Kernel data structure for implementations of +/// HashAggregateFunction. The four necessary components of an aggregation +/// kernel are the init, consume, merge, and finalize functions. +/// +/// * init: creates a new KernelState for a kernel. +/// * resize: ensure that the KernelState can accommodate the specified number of groups. +/// * consume: processes an ExecSpan (which includes the argument as well +/// as an array of group identifiers) and updates the KernelState found in the +/// KernelContext. +/// * merge: combines one KernelState with another. +/// * finalize: produces the end result of the aggregation using the +/// KernelState in the KernelContext. +struct ARROW_EXPORT HashAggregateKernel : public Kernel { + HashAggregateKernel() = default; + + HashAggregateKernel(std::shared_ptr sig, KernelInit init, + HashAggregateResize resize, HashAggregateConsume consume, + HashAggregateMerge merge, HashAggregateFinalize finalize, + const bool ordered) + : Kernel(std::move(sig), std::move(init)), + resize(resize), + consume(consume), + merge(merge), + finalize(finalize), + ordered(ordered) {} + + HashAggregateKernel(std::vector in_types, OutputType out_type, + KernelInit init, HashAggregateConsume consume, + HashAggregateResize resize, HashAggregateMerge merge, + HashAggregateFinalize finalize, const bool ordered) + : HashAggregateKernel( + KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init), resize, consume, merge, finalize, ordered) {} + + HashAggregateResize resize; + HashAggregateConsume consume; + HashAggregateMerge merge; + HashAggregateFinalize finalize; + /// @brief whether the summarizer requires ordering + /// This is similar to ScalarAggregateKernel. See ScalarAggregateKernel + /// for detailed doc of this variable. + bool ordered = false; +}; + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/ordering.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/ordering.h new file mode 100644 index 0000000000000000000000000000000000000000..61caa2b570dd31dc988d34406f9b05c3573333e2 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/ordering.h @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/type.h" +#include "arrow/util/compare.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +enum class SortOrder { + /// Arrange values in increasing order + Ascending, + /// Arrange values in decreasing order + Descending, +}; + +enum class NullPlacement { + /// Place nulls and NaNs before any non-null values. + /// NaNs will come after nulls. + AtStart, + /// Place nulls and NaNs after any non-null values. + /// NaNs will come before nulls. + AtEnd, +}; + +/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices +class ARROW_EXPORT SortKey : public util::EqualityComparable { + public: + explicit SortKey(FieldRef target, SortOrder order = SortOrder::Ascending) + : target(std::move(target)), order(order) {} + + bool Equals(const SortKey& other) const; + std::string ToString() const; + + /// A FieldRef targeting the sort column. + FieldRef target; + /// How to order by this sort key. + SortOrder order; +}; + +class ARROW_EXPORT Ordering : public util::EqualityComparable { + public: + Ordering(std::vector sort_keys, + NullPlacement null_placement = NullPlacement::AtStart) + : sort_keys_(std::move(sort_keys)), null_placement_(null_placement) {} + /// true if data ordered by other is also ordered by this + /// + /// For example, if data is ordered by [a, b, c] then it is also ordered + /// by [a, b] but not by [b, c] or [a, b, c, d]. + /// + /// [a, b].IsSuborderOf([a, b, c]) - true + /// [a, b, c].IsSuborderOf([a, b, c]) - true + /// [b, c].IsSuborderOf([a, b, c]) - false + /// [a, b, c, d].IsSuborderOf([a, b, c]) - false + /// + /// The implicit ordering is not a suborder of any other ordering and + /// no other ordering is a suborder of it. The implicit ordering is not a + /// suborder of itself. + /// + /// The unordered ordering is a suborder of all other orderings but no + /// other ordering is a suborder of it. The unordered ordering is a suborder + /// of itself. + /// + /// The unordered ordering is a suborder of the implicit ordering. + bool IsSuborderOf(const Ordering& other) const; + + bool Equals(const Ordering& other) const; + std::string ToString() const; + + bool is_implicit() const { return is_implicit_; } + bool is_unordered() const { return !is_implicit_ && sort_keys_.empty(); } + + const std::vector& sort_keys() const { return sort_keys_; } + NullPlacement null_placement() const { return null_placement_; } + + static const Ordering& Implicit() { + static const Ordering kImplicit(true); + return kImplicit; + } + + static const Ordering& Unordered() { + static const Ordering kUnordered(false); + // It is also possible to get an unordered ordering by passing in an empty vector + // using the normal constructor. This is ok and useful when ordering comes from user + // input. + return kUnordered; + } + + private: + explicit Ordering(bool is_implicit) + : null_placement_(NullPlacement::AtStart), is_implicit_(is_implicit) {} + /// Column key(s) to order by and how to order by these sort keys. 
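A short sketch of the ordering vocabulary above: SortKeys compose into an Ordering, and IsSuborderOf captures the containment rules spelled out in the comment.

#include <iostream>

#include "arrow/compute/ordering.h"

int main() {
  namespace cp = arrow::compute;

  cp::Ordering abc({cp::SortKey("a"), cp::SortKey("b"),
                    cp::SortKey("c", cp::SortOrder::Descending)});
  cp::Ordering ab({cp::SortKey("a"), cp::SortKey("b")});

  std::cout << ab.IsSuborderOf(abc) << "\n";  // 1: ordered by [a,b,c] implies ordered by [a,b]
  std::cout << abc.IsSuborderOf(ab) << "\n";  // 0
  std::cout << cp::Ordering::Unordered().IsSuborderOf(abc) << "\n";  // 1
  return 0;
}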
+ std::vector sort_keys_; + /// Whether nulls and NaNs are placed at the start or at the end + NullPlacement null_placement_; + bool is_implicit_ = false; +}; + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/registry.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/registry.h new file mode 100644 index 0000000000000000000000000000000000000000..f31c4c1ba5920626578a4e4170e3cd2d28288545 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/registry.h @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class Function; +class FunctionOptionsType; + +/// \brief A mutable central function registry for built-in functions as well +/// as user-defined functions. Functions are implementations of +/// arrow::compute::Function. +/// +/// Generally, each function contains kernels which are implementations of a +/// function for a specific argument signature. After looking up a function in +/// the registry, one can either execute it eagerly with Function::Execute or +/// use one of the function's dispatch methods to pick a suitable kernel for +/// lower-level function execution. +class ARROW_EXPORT FunctionRegistry { + public: + ~FunctionRegistry(); + + /// \brief Construct a new registry. + /// + /// Most users only need to use the global registry. + static std::unique_ptr Make(); + + /// \brief Construct a new nested registry with the given parent. + /// + /// Most users only need to use the global registry. The returned registry never changes + /// its parent, even when an operation allows overwriting. + static std::unique_ptr Make(FunctionRegistry* parent); + + /// \brief Check whether a new function can be added to the registry. + /// + /// \returns Status::KeyError if a function with the same name is already registered. + Status CanAddFunction(std::shared_ptr function, bool allow_overwrite = false); + + /// \brief Add a new function to the registry. + /// + /// \returns Status::KeyError if a function with the same name is already registered. + Status AddFunction(std::shared_ptr function, bool allow_overwrite = false); + + /// \brief Check whether an alias can be added for the given function name. + /// + /// \returns Status::KeyError if the function with the given name is not registered. 
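A sketch of standalone and nested registries using only the members declared above; the ScalarFunction is a kernel-less placeholder purely to exercise registration, and GetFunctionRegistry() is assumed from later in registry.h.

#include <iostream>
#include <memory>

#include "arrow/compute/function.h"
#include "arrow/compute/registry.h"

int main() {
  namespace cp = arrow::compute;

  auto fn = std::make_shared<cp::ScalarFunction>(
      "editor_placeholder", cp::Arity::Unary(),
      cp::FunctionDoc("Placeholder", "Editor's demo; carries no kernels.",
                      {"arg"}));

  // A fresh registry starts empty; a nested one overlays its parent.
  std::unique_ptr<cp::FunctionRegistry> standalone = cp::FunctionRegistry::Make();
  std::unique_ptr<cp::FunctionRegistry> nested =
      cp::FunctionRegistry::Make(cp::GetFunctionRegistry());

  std::cout << standalone->CanAddFunction(fn).ok() << "\n";  // 1
  std::cout << standalone->AddFunction(fn).ok() << "\n";     // 1
  std::cout << standalone->AddFunction(fn).ok() << "\n";     // 0: name already registered
  std::cout << nested->CanAddFunction(fn).ok() << "\n";      // 1: name unused here or in parent
  return 0;
}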
+ Status CanAddAlias(const std::string& target_name, const std::string& source_name); + + /// \brief Add alias for the given function name. + /// + /// \returns Status::KeyError if the function with the given name is not registered. + Status AddAlias(const std::string& target_name, const std::string& source_name); + + /// \brief Check whether a new function options type can be added to the registry. + /// + /// \return Status::KeyError if a function options type with the same name is already + /// registered. + Status CanAddFunctionOptionsType(const FunctionOptionsType* options_type, + bool allow_overwrite = false); + + /// \brief Add a new function options type to the registry. + /// + /// \returns Status::KeyError if a function options type with the same name is already + /// registered. + Status AddFunctionOptionsType(const FunctionOptionsType* options_type, + bool allow_overwrite = false); + + /// \brief Retrieve a function by name from the registry. + Result> GetFunction(const std::string& name) const; + + /// \brief Return vector of all entry names in the registry. + /// + /// Helpful for displaying a manifest of available functions. + std::vector GetFunctionNames() const; + + /// \brief Retrieve a function options type by name from the registry. + Result GetFunctionOptionsType( + const std::string& name) const; + + /// \brief The number of currently registered functions. + int num_functions() const; + + /// \brief The cast function object registered in AddFunction. + /// + /// Helpful for get cast function as needed. + const Function* cast_function() const; + + private: + FunctionRegistry(); + + // Use PIMPL pattern to not have std::unordered_map here + class FunctionRegistryImpl; + std::unique_ptr impl_; + + explicit FunctionRegistry(FunctionRegistryImpl* impl); +}; + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/row/grouper.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/row/grouper.h new file mode 100644 index 0000000000000000000000000000000000000000..345bc62924241d181d383d74c22a9bdef6228059 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/row/grouper.h @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/compute/kernel.h" +#include "arrow/datum.h" +#include "arrow/result.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +/// \brief A segment +/// A segment group is a chunk of continuous rows that have the same segment key. 
(For example, in ordered time series processing, the segment key can be "date", and a
+/// segment group can be all the rows that belong to the same date.) A segment group can
+/// span across multiple exec batches. A segment is a chunk of continuous rows that has
+/// the same segment key within a given batch. When a segment group spans across batches,
+/// it will have multiple segments. A segment never spans across batches. The segment
+/// data structure only makes sense when used along with an exec batch.
+struct ARROW_EXPORT Segment {
+  /// \brief the offset into the batch where the segment starts
+  int64_t offset;
+  /// \brief the length of the segment
+  int64_t length;
+  /// \brief whether the segment may be extended by a next one
+  bool is_open;
+  /// \brief whether the segment extends a preceding one
+  bool extends;
+};
+
+inline bool operator==(const Segment& segment1, const Segment& segment2) {
+  return segment1.offset == segment2.offset && segment1.length == segment2.length &&
+         segment1.is_open == segment2.is_open && segment1.extends == segment2.extends;
+}
+inline bool operator!=(const Segment& segment1, const Segment& segment2) {
+  return !(segment1 == segment2);
+}
+
+/// \brief a helper class to divide a batch into segments of equal values
+///
+/// For example, given a batch with two columns specified as segment keys:
+///
+///   A A [other columns]...
+///   A A ...
+///   A B ...
+///   A B ...
+///   A A ...
+///
+/// Then the batch could be divided into 3 segments. The first would be rows 0 & 1,
+/// the second would be rows 2 & 3, and the third would be row 4.
+///
+/// Further, a segmenter keeps track of the last value seen. This allows it to calculate
+/// segments which span batches. In our above example the last batch we emit would set
+/// the "open" flag, which indicates whether the segment may extend into the next batch.
+///
+/// If the next call to the segmenter starts with `A A` then that segment would set the
+/// "extends" flag, which indicates whether the segment continues the last open batch.
+class ARROW_EXPORT RowSegmenter {
+ public:
+  virtual ~RowSegmenter() = default;
+
+  /// \brief Construct a Segmenter which segments on the specified key types
+  ///
+  /// \param[in] key_types the specified key types
+  /// \param[in] nullable_keys whether values of the specified keys may be null
+  /// \param[in] ctx the execution context to use
+  static Result<std::unique_ptr<RowSegmenter>> Make(
+      const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx);
+
+  /// \brief Return the key types of this segmenter
+  virtual const std::vector<TypeHolder>& key_types() const = 0;
+
+  /// \brief Reset this segmenter
+  ///
+  /// A segmenter normally extends (see `Segment`) a segment from one batch to the next.
+  /// If segment-extension is undesirable, for example when each batch is processed
+  /// independently, then `Reset` should be invoked before processing the next batch.
+  virtual Status Reset() = 0;
+
+  /// \brief Get the next segment for the given batch starting from the given offset
+  /// DEPRECATED: Due to its inefficiency, use GetSegments instead.
+  ARROW_DEPRECATED("Deprecated in 18.0.0. Use GetSegments instead.")
+  virtual Result<Segment> GetNextSegment(const ExecSpan& batch, int64_t offset) = 0;
+
+  /// \brief Get all segments for the given batch
+  virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0;
+};
+
+/// Consumes batches of keys and yields batches of the group ids.
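+///
+/// For illustration, a rough usage sketch (assumes an ExecContext* ctx and an
+/// ExecSpan key_batch holding only the key columns; not part of the upstream header):
+///
+///   ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make({int32()}, ctx));
+///   ARROW_ASSIGN_OR_RAISE(Datum group_ids, grouper->Consume(key_batch));
+///   ARROW_ASSIGN_OR_RAISE(ExecBatch uniques, grouper->GetUniques());
+///   // group_ids holds one uint32 group id per input row; uniques holds one
+///   // row per distinct key seen so far.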
+class ARROW_EXPORT Grouper { + public: + virtual ~Grouper() = default; + + /// Construct a Grouper which receives the specified key types + static Result> Make(const std::vector& key_types, + ExecContext* ctx = default_exec_context()); + + /// Reset all intermediate state, make the grouper logically as just `Make`ed. + /// The underlying buffers, if any, may or may not be released though. + virtual Status Reset() = 0; + + /// Consume a batch of keys, producing the corresponding group ids as an integer array, + /// over a slice defined by an offset and length, which defaults to the batch length. + /// Currently only uint32 indices will be produced, eventually the bit width will only + /// be as wide as necessary. + virtual Result Consume(const ExecSpan& batch, int64_t offset = 0, + int64_t length = -1) = 0; + + /// Get current unique keys. May be called multiple times. + virtual Result GetUniques() = 0; + + /// Get the current number of groups. + virtual uint32_t num_groups() const = 0; + + /// \brief Assemble lists of indices of identical elements. + /// + /// \param[in] ids An unsigned, all-valid integral array which will be + /// used as grouping criteria. + /// \param[in] num_groups An upper bound for the elements of ids + /// \param[in] ctx Execution context to use during the operation + /// \return A num_groups-long ListArray where the slot at i contains a + /// list of indices where i appears in ids. + /// + /// MakeGroupings([ + /// 2, + /// 2, + /// 5, + /// 5, + /// 2, + /// 3 + /// ], 8) == [ + /// [], + /// [], + /// [0, 1, 4], + /// [5], + /// [], + /// [2, 3], + /// [], + /// [] + /// ] + static Result> MakeGroupings( + const UInt32Array& ids, uint32_t num_groups, + ExecContext* ctx = default_exec_context()); + + /// \brief Produce a ListArray whose slots are selections of `array` which correspond to + /// the provided groupings. + /// + /// For example, + /// ApplyGroupings([ + /// [], + /// [], + /// [0, 1, 4], + /// [5], + /// [], + /// [2, 3], + /// [], + /// [] + /// ], [2, 2, 5, 5, 2, 3]) == [ + /// [], + /// [], + /// [2, 2, 2], + /// [3], + /// [], + /// [5, 5], + /// [], + /// [] + /// ] + static Result> ApplyGroupings( + const ListArray& groupings, const Array& array, + ExecContext* ctx = default_exec_context()); +}; + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/type_fwd.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..89f32ceb0f906e0d50bf063da22f33c3a856fe5d --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/type_fwd.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/visibility.h" + +namespace arrow { + +struct Datum; +struct TypeHolder; + +namespace compute { + +class Function; +class ScalarAggregateFunction; +class FunctionExecutor; +class FunctionOptions; +class FunctionRegistry; + +/// \brief Return the process-global function registry. +// Defined in registry.cc +ARROW_EXPORT FunctionRegistry* GetFunctionRegistry(); + +class CastOptions; + +struct ExecBatch; +class ExecContext; +class KernelContext; + +struct Kernel; +struct ScalarKernel; +struct ScalarAggregateKernel; +struct VectorKernel; + +struct KernelState; + +class Expression; + +ARROW_EXPORT ExecContext* default_exec_context(); +ARROW_EXPORT ExecContext* threaded_exec_context(); + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/util.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/util.h new file mode 100644 index 0000000000000000000000000000000000000000..1aaff43e10e1fd6b10a1e05eb1d33039b55b8563 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/compute/util.h @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/compute/expression.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/util/cpu_info.h" +#include "arrow/util/simd.h" + +#if defined(__clang__) || defined(__GNUC__) +# define BYTESWAP(x) __builtin_bswap64(x) +# define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31))) +# define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63))) +#elif defined(_MSC_VER) +# include +# define BYTESWAP(x) _byteswap_uint64(x) +# define ROTL(x, n) _rotl((x), (n)) +# define ROTL64(x, n) _rotl64((x), (n)) +#endif + +namespace arrow { +namespace util { + +// Some platforms typedef int64_t as long int instead of long long int, +// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics +// which need long long. +// We use the cast to the type below in these intrinsics to make the code +// compile in all cases. +// +using int64_for_gather_t = const long long int; // NOLINT runtime-int + +// All MiniBatch... classes use TempVectorStack for vector allocations and can +// only work with vectors up to 1024 elements. +// +// They should only be allocated on the stack to guarantee the right sequence +// of allocation and deallocation of vectors from TempVectorStack. 
+// +class MiniBatch { + public: + static constexpr int kLogMiniBatchLength = 10; + static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; +}; + +namespace bit_util { + +ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + int* num_indexes, uint16_t* indexes, + int bit_offset = 0); + +ARROW_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, int* num_indexes, + uint16_t* indexes, int bit_offset = 0); + +// Input and output indexes may be pointing to the same data (in-place filtering). +ARROW_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, int* num_indexes_bit0, + uint16_t* indexes_bit0, uint16_t* indexes_bit1, + int bit_offset = 0); + +// Bit 1 is replaced with byte 0xFF. +ARROW_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); + +// Return highest bit of each byte. +ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits, + const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); + +ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, + uint32_t num_bytes); + +#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2) +// The functions below use BMI2 instructions, be careful before calling! + +namespace avx2 { +ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); +ARROW_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, int* num_indexes, + uint16_t* indexes, uint16_t base_index = 0); +ARROW_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, + uint8_t* bytes); +ARROW_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, + uint8_t* bits); +ARROW_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); +} // namespace avx2 + +#endif + +} // namespace bit_util +} // namespace util + +namespace compute { + +/// Modify an Expression with pre-order and post-order visitation. +/// `pre` will be invoked on each Expression. `pre` will visit Calls before their +/// arguments, `post_call` will visit Calls (and no other Expressions) after their +/// arguments. Visitors should return the Identical expression to indicate no change; this +/// will prevent unnecessary construction in the common case where a modification is not +/// possible/necessary/... +/// +/// If an argument was modified, `post_call` visits a reconstructed Call with the modified +/// arguments but also receives a pointer to the unmodified Expression as a second +/// argument. If no arguments were modified the unmodified Expression* will be nullptr. 
+template +Result ModifyExpression(Expression expr, const PreVisit& pre, + const PostVisitCall& post_call) { + ARROW_ASSIGN_OR_RAISE(expr, Result(pre(std::move(expr)))); + + auto call = expr.call(); + if (!call) return expr; + + bool at_least_one_modified = false; + std::vector modified_arguments; + + for (size_t i = 0; i < call->arguments.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(auto modified_argument, + ModifyExpression(call->arguments[i], pre, post_call)); + + if (Identical(modified_argument, call->arguments[i])) { + continue; + } + + if (!at_least_one_modified) { + modified_arguments = call->arguments; + at_least_one_modified = true; + } + + modified_arguments[i] = std::move(modified_argument); + } + + if (at_least_one_modified) { + // reconstruct the call expression with the modified arguments + auto modified_call = *call; + modified_call.arguments = std::move(modified_arguments); + return post_call(Expression(std::move(modified_call)), &expr); + } + + return post_call(std::move(expr), NULLPTR); +} + +// Helper class to calculate the modified number of rows to process using SIMD. +// +// Some array elements at the end will be skipped in order to avoid buffer +// overrun, when doing memory loads and stores using larger word size than a +// single array element. +// +class TailSkipForSIMD { + public: + static int64_t FixBitAccess(int num_bytes_accessed_together, int64_t num_rows, + int bit_offset) { + int64_t num_bytes = bit_util::BytesForBits(num_rows + bit_offset); + int64_t num_bytes_safe = + std::max(static_cast(0LL), num_bytes - num_bytes_accessed_together + 1); + int64_t num_rows_safe = + std::max(static_cast(0LL), 8 * num_bytes_safe - bit_offset); + return std::min(num_rows_safe, num_rows); + } + static int64_t FixBinaryAccess(int num_bytes_accessed_together, int64_t num_rows, + int64_t length) { + int64_t num_rows_to_skip = bit_util::CeilDiv(length, num_bytes_accessed_together); + int64_t num_rows_safe = + std::max(static_cast(0LL), num_rows - num_rows_to_skip); + return num_rows_safe; + } + static int64_t FixVarBinaryAccess(int num_bytes_accessed_together, int64_t num_rows, + const uint32_t* offsets) { + // Do not process rows that could read past the end of the buffer using N + // byte loads/stores. + // + int64_t num_rows_safe = num_rows; + while (num_rows_safe > 0 && + offsets[num_rows_safe] + num_bytes_accessed_together > offsets[num_rows]) { + --num_rows_safe; + } + return num_rows_safe; + } + static int FixSelection(int64_t num_rows_safe, int num_selected, + const uint16_t* selection) { + int num_selected_safe = num_selected; + while (num_selected_safe > 0 && selection[num_selected_safe - 1] >= num_rows_safe) { + --num_selected_safe; + } + return num_selected_safe; + } +}; + +} // namespace compute +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/config.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/config.h new file mode 100644 index 0000000000000000000000000000000000000000..617d6c268b55ea344a3fe7f96141ff0f7e4d3f88 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/config.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/config.h" // IWYU pragma: export +#include "arrow/util/visibility.h" + +namespace arrow { + +struct BuildInfo { + /// The packed version number, e.g. 1002003 (decimal) for Arrow 1.2.3 + int version; + /// The "major" version number, e.g. 1 for Arrow 1.2.3 + int version_major; + /// The "minor" version number, e.g. 2 for Arrow 1.2.3 + int version_minor; + /// The "patch" version number, e.g. 3 for Arrow 1.2.3 + int version_patch; + /// The version string, e.g. "1.2.3" + std::string version_string; + std::string so_version; + std::string full_so_version; + + /// The CMake compiler identifier, e.g. "GNU" + std::string compiler_id; + std::string compiler_version; + std::string compiler_flags; + + /// The git changeset id, if available + std::string git_id; + /// The git changeset description, if available + std::string git_description; + std::string package_kind; + + /// The uppercase build type, e.g. "DEBUG" or "RELEASE" + std::string build_type; +}; + +struct RuntimeInfo { + /// The enabled SIMD level + /// + /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL + /// environment variable is set to another value. + std::string simd_level; + + /// The SIMD level available on the OS and CPU + std::string detected_simd_level; + + /// Whether using the OS-based timezone database + /// This is set at compile-time. + bool using_os_timezone_db; + + /// The path to the timezone database; by default None. + std::optional timezone_db_path; +}; + +/// \brief Get runtime build info. +/// +/// The returned values correspond to exact loaded version of the Arrow library, +/// rather than the values frozen at application compile-time through the `ARROW_*` +/// preprocessor definitions. +ARROW_EXPORT +const BuildInfo& GetBuildInfo(); + +/// \brief Get runtime info. +/// +ARROW_EXPORT +RuntimeInfo GetRuntimeInfo(); + +struct GlobalOptions { + /// Path to text timezone database. This is only configurable on Windows, + /// which does not have a compatible OS timezone database. + std::optional timezone_db_path; +}; + +ARROW_EXPORT +Status Initialize(const GlobalOptions& options) noexcept; + +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/api.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/api.h new file mode 100644 index 0000000000000000000000000000000000000000..4af1835cd709d43e0abe3b39b46531cae9a047fc --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/api.h @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/csv/options.h" +#include "arrow/csv/reader.h" +#include "arrow/csv/writer.h" diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/chunker.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/chunker.h new file mode 100644 index 0000000000000000000000000000000000000000..662b16ec40a9485547ce01b32ea0325a23122711 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/chunker.h @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/status.h" +#include "arrow/util/delimiting.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +ARROW_EXPORT +std::unique_ptr MakeChunker(const ParseOptions& options); + +} // namespace csv +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_builder.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..07279db313e92d2daeb93be12d0ab307d0c25201 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_builder.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +class BlockParser; +struct ConvertOptions; + +class ARROW_EXPORT ColumnBuilder { + public: + virtual ~ColumnBuilder() = default; + + /// Spawn a task that will try to convert and append the given CSV block. + /// All calls to Append() should happen on the same thread, otherwise + /// call Insert() instead. + virtual void Append(const std::shared_ptr& parser) = 0; + + /// Spawn a task that will try to convert and insert the given CSV block + virtual void Insert(int64_t block_index, + const std::shared_ptr& parser) = 0; + + /// Return the final chunked array. The TaskGroup _must_ have finished! + virtual Result> Finish() = 0; + + std::shared_ptr task_group() { return task_group_; } + + /// Construct a strictly-typed ColumnBuilder. + static Result> Make( + MemoryPool* pool, const std::shared_ptr& type, int32_t col_index, + const ConvertOptions& options, + const std::shared_ptr& task_group); + + /// Construct a type-inferring ColumnBuilder. + static Result> Make( + MemoryPool* pool, int32_t col_index, const ConvertOptions& options, + const std::shared_ptr& task_group); + + /// Construct a ColumnBuilder for a column of nulls + /// (i.e. not present in the CSV file). + static Result> MakeNull( + MemoryPool* pool, const std::shared_ptr& type, + const std::shared_ptr& task_group); + + protected: + explicit ColumnBuilder(std::shared_ptr task_group) + : task_group_(std::move(task_group)) {} + + std::shared_ptr task_group_; +}; + +} // namespace csv +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_decoder.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..5fbbd5df58b1c588b88e16b68da50b9399211abc --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_decoder.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +class BlockParser; +struct ConvertOptions; + +class ARROW_EXPORT ColumnDecoder { + public: + virtual ~ColumnDecoder() = default; + + /// Spawn a task that will try to convert and insert the given CSV block + virtual Future> Decode( + const std::shared_ptr& parser) = 0; + + /// Construct a strictly-typed ColumnDecoder. 
+ static Result> Make(MemoryPool* pool, + std::shared_ptr type, + int32_t col_index, + const ConvertOptions& options); + + /// Construct a type-inferring ColumnDecoder. + /// Inference will run only on the first block, the type will be frozen afterwards. + static Result> Make(MemoryPool* pool, int32_t col_index, + const ConvertOptions& options); + + /// Construct a ColumnDecoder for a column of nulls + /// (i.e. not present in the CSV file). + static Result> MakeNull(MemoryPool* pool, + std::shared_ptr type); + + protected: + ColumnDecoder() = default; +}; + +} // namespace csv +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/converter.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/converter.h new file mode 100644 index 0000000000000000000000000000000000000000..639f692f26a1ba3a134caac68a432ac22f068917 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/converter.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +class BlockParser; + +class ARROW_EXPORT Converter { + public: + Converter(const std::shared_ptr& type, const ConvertOptions& options, + MemoryPool* pool); + virtual ~Converter() = default; + + virtual Result> Convert(const BlockParser& parser, + int32_t col_index) = 0; + + std::shared_ptr type() const { return type_; } + + // Create a Converter for the given data type + static Result> Make( + const std::shared_ptr& type, const ConvertOptions& options, + MemoryPool* pool = default_memory_pool()); + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(Converter); + + virtual Status Initialize() = 0; + + // CAUTION: ConvertOptions can grow large (if it customizes hundreds or + // thousands of columns), so avoid copying it in each Converter. + const ConvertOptions& options_; + MemoryPool* pool_; + std::shared_ptr type_; +}; + +class ARROW_EXPORT DictionaryConverter : public Converter { + public: + DictionaryConverter(const std::shared_ptr& value_type, + const ConvertOptions& options, MemoryPool* pool); + + // If the dictionary length goes above this value, conversion will fail + // with Status::IndexError. + virtual void SetMaxCardinality(int32_t max_length) = 0; + + // Create a Converter for the given dictionary value type. + // The dictionary index type will always be Int32. 
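+  // For illustration (assumes string values and a ConvertOptions named `options`;
+  // sketch only):
+  //
+  //   ARROW_ASSIGN_OR_RAISE(auto converter,
+  //                         DictionaryConverter::Make(utf8(), options));
+  //   converter->SetMaxCardinality(1000);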
+ static Result> Make( + const std::shared_ptr& value_type, const ConvertOptions& options, + MemoryPool* pool = default_memory_pool()); + + protected: + std::shared_ptr value_type_; +}; + +} // namespace csv +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/invalid_row.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/invalid_row.h new file mode 100644 index 0000000000000000000000000000000000000000..4360ceaaea6ac07dd218c93ce13c3ab14c16fc63 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/invalid_row.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace arrow { +namespace csv { + +/// \brief Description of an invalid row +struct InvalidRow { + /// \brief Number of columns expected in the row + int32_t expected_columns; + /// \brief Actual number of columns found in the row + int32_t actual_columns; + /// \brief The physical row number if known or -1 + /// + /// This number is one-based and also accounts for non-data rows (such as + /// CSV header rows). + int64_t number; + /// \brief View of the entire row. Memory will be freed after callback returns + const std::string_view text; +}; + +/// \brief Result returned by an InvalidRowHandler +enum class InvalidRowResult { + // Generate an error describing this row + Error, + // Skip over this row + Skip +}; + +/// \brief callback for handling a row with an invalid number of columns while parsing +/// \return result indicating if an error should be returned from the parser or the row is +/// skipped +using InvalidRowHandler = std::function; + +} // namespace csv +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/options.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/options.h new file mode 100644 index 0000000000000000000000000000000000000000..7723dcedc611e922c932d5f9e09e984044ab3c21 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/options.h @@ -0,0 +1,220 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/csv/invalid_row.h" +#include "arrow/csv/type_fwd.h" +#include "arrow/io/interfaces.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class DataType; +class TimestampParser; + +namespace csv { + +// Silly workaround for https://github.com/michaeljones/breathe/issues/453 +constexpr char kDefaultEscapeChar = '\\'; + +struct ARROW_EXPORT ParseOptions { + // Parsing options + + /// Field delimiter + char delimiter = ','; + /// Whether quoting is used + bool quoting = true; + /// Quoting character (if `quoting` is true) + char quote_char = '"'; + /// Whether a quote inside a value is double-quoted + bool double_quote = true; + /// Whether escaping is used + bool escaping = false; + /// Escaping character (if `escaping` is true) + char escape_char = kDefaultEscapeChar; + /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters + bool newlines_in_values = false; + /// Whether empty lines are ignored. If false, an empty line represents + /// a single empty value (assuming a one-column CSV file). + bool ignore_empty_lines = true; + /// A handler function for rows which do not have the correct number of columns + InvalidRowHandler invalid_row_handler; + + /// Create parsing options with default values + static ParseOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; +}; + +struct ARROW_EXPORT ConvertOptions { + // Conversion options + + /// Whether to check UTF8 validity of string columns + bool check_utf8 = true; + /// Optional per-column types (disabling type inference on those columns) + std::unordered_map> column_types; + /// Recognized spellings for null values + std::vector null_values; + /// Recognized spellings for boolean true values + std::vector true_values; + /// Recognized spellings for boolean false values + std::vector false_values; + + /// Whether string / binary columns can have null values. + /// + /// If true, then strings in "null_values" are considered null for string columns. + /// If false, then all strings are valid string values. + bool strings_can_be_null = false; + + /// Whether quoted values can be null. + /// + /// If true, then strings in "null_values" are also considered null when they + /// appear quoted in the CSV file. Otherwise, quoted values are never considered null. + bool quoted_strings_can_be_null = true; + + /// Whether to try to automatically dict-encode string / binary data. + /// If true, then when type inference detects a string or binary column, + /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values + /// (per chunk), after which it switches to regular encoding. + /// + /// This setting is ignored for non-inferred columns (those in `column_types`). + bool auto_dict_encode = false; + int32_t auto_dict_max_cardinality = 50; + + /// Decimal point character for floating-point and decimal data + char decimal_point = '.'; + + // XXX Should we have a separate FilterOptions? 
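+
+  // For illustration, a typical configuration (assumes the file has columns
+  // named "id" and "price"; sketch only, see the field documentation below):
+  //
+  //   auto convert_options = ConvertOptions::Defaults();
+  //   convert_options.include_columns = {"id", "price"};
+  //   convert_options.column_types["price"] = float64();
+  //   convert_options.strings_can_be_null = true;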
+ + /// If non-empty, indicates the names of columns from the CSV file that should + /// be actually read and converted (in the vector's order). + /// Columns not in this vector will be ignored. + std::vector include_columns; + /// If false, columns in `include_columns` but not in the CSV file will error out. + /// If true, columns in `include_columns` but not in the CSV file will produce + /// a column of nulls (whose type is selected using `column_types`, + /// or null by default) + /// This option is ignored if `include_columns` is empty. + bool include_missing_columns = false; + + /// User-defined timestamp parsers, using the virtual parser interface in + /// arrow/util/value_parsing.h. More than one parser can be specified, and + /// the CSV conversion logic will try parsing values starting from the + /// beginning of this vector. If no parsers are specified, we use the default + /// built-in ISO-8601 parser. + std::vector> timestamp_parsers; + + /// Create conversion options with default values, including conventional + /// values for `null_values`, `true_values` and `false_values` + static ConvertOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; +}; + +struct ARROW_EXPORT ReadOptions { + // Reader options + + /// Whether to use the global CPU thread pool + bool use_threads = true; + + /// \brief Block size we request from the IO layer. + /// + /// This will determine multi-threading granularity as well as + /// the size of individual record batches. + /// Minimum valid value for block size is 1 + int32_t block_size = 1 << 20; // 1 MB + + /// Number of header rows to skip (not including the row of column names, if any) + int32_t skip_rows = 0; + + /// Number of rows to skip after the column names are read, if any + int32_t skip_rows_after_names = 0; + + /// Column names for the target table. + /// If empty, fall back on autogenerate_column_names. + std::vector column_names; + + /// Whether to autogenerate column names if `column_names` is empty. + /// If true, column names will be of the form "f0", "f1"... + /// If false, column names will be read from the first CSV row after `skip_rows`. + bool autogenerate_column_names = false; + + /// Create read options with default values + static ReadOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; +}; + +/// \brief Quoting style for CSV writing +enum class ARROW_EXPORT QuotingStyle { + /// Only enclose values in quotes which need them, because their CSV rendering can + /// contain quotes itself (e.g. strings or binary values) + Needed, + /// Enclose all valid values in quotes. Nulls are not quoted. May cause readers to + /// interpret all values as strings if schema is inferred. + AllValid, + /// Do not enclose any values in quotes. Prevents values from containing quotes ("), + /// cell delimiters (,) or line endings (\\r, \\n), (following RFC4180). If values + /// contain these characters, an error is caused when attempting to write. + None +}; + +struct ARROW_EXPORT WriteOptions { + /// Whether to write an initial header line with column names + bool include_header = true; + + /// \brief Maximum number of rows processed at a time + /// + /// The CSV writer converts and writes data in batches of N rows. + /// This number can impact performance. + int32_t batch_size = 1024; + + /// Field delimiter + char delimiter = ','; + + /// \brief The string to write for null values. Quotes are not allowed in this string. 
+ std::string null_string; + + /// \brief IO context for writing. + io::IOContext io_context; + + /// \brief The end of line character to use for ending rows + std::string eol = "\n"; + + /// \brief Quoting style + QuotingStyle quoting_style = QuotingStyle::Needed; + + /// Create write options with default values + static WriteOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; +}; + +} // namespace csv +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/parser.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/parser.h new file mode 100644 index 0000000000000000000000000000000000000000..c73e52ce831ed95b4abe83084b483c15660bae7e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/parser.h @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/csv/options.h" +#include "arrow/csv/type_fwd.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; + +namespace csv { + +/// Skip at most num_rows from the given input. The input pointer is updated +/// and the number of actually skipped rows is returns (may be less than +/// requested if the input is too short). 
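+///
+/// For illustration (assumes `data` points to `size` bytes of CSV text):
+///
+///   const uint8_t* remaining = nullptr;
+///   int32_t skipped = SkipRows(data, size, /*num_rows=*/2, &remaining);
+///   // `remaining` now points past the skipped rows; `skipped` may be less
+///   // than 2 if the input was too short.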
+ARROW_EXPORT +int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows, + const uint8_t** out_data); + +class BlockParserImpl; + +namespace detail { + +struct ParsedValueDesc { + uint32_t offset : 31; + bool quoted : 1; +}; + +class ARROW_EXPORT DataBatch { + public: + explicit DataBatch(int32_t num_cols) : num_cols_(num_cols) {} + + /// \brief Return the number of parsed rows (not skipped) + int32_t num_rows() const { return num_rows_; } + /// \brief Return the number of parsed columns + int32_t num_cols() const { return num_cols_; } + /// \brief Return the total size in bytes of parsed data + uint32_t num_bytes() const { return parsed_size_; } + /// \brief Return the number of skipped rows + int32_t num_skipped_rows() const { return static_cast(skipped_rows_.size()); } + + template + Status VisitColumn(int32_t col_index, int64_t first_row, Visitor&& visit) const { + using detail::ParsedValueDesc; + + int32_t batch_row = 0; + for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) { + const auto& values_buffer = values_buffers_[buf_index]; + const auto values = reinterpret_cast(values_buffer->data()); + const auto max_pos = + static_cast(values_buffer->size() / sizeof(ParsedValueDesc)) - 1; + for (int32_t pos = col_index; pos < max_pos; pos += num_cols_, ++batch_row) { + auto start = values[pos].offset; + auto stop = values[pos + 1].offset; + auto quoted = values[pos + 1].quoted; + Status status = visit(parsed_ + start, stop - start, quoted); + if (ARROW_PREDICT_FALSE(!status.ok())) { + return DecorateWithRowNumber(std::move(status), first_row, batch_row); + } + } + } + return Status::OK(); + } + + template + Status VisitLastRow(Visitor&& visit) const { + using detail::ParsedValueDesc; + + const auto& values_buffer = values_buffers_.back(); + const auto values = reinterpret_cast(values_buffer->data()); + const auto start_pos = + static_cast(values_buffer->size() / sizeof(ParsedValueDesc)) - + num_cols_ - 1; + for (int32_t col_index = 0; col_index < num_cols_; ++col_index) { + auto start = values[start_pos + col_index].offset; + auto stop = values[start_pos + col_index + 1].offset; + auto quoted = values[start_pos + col_index + 1].quoted; + ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted)); + } + return Status::OK(); + } + + protected: + Status DecorateWithRowNumber(Status&& status, int64_t first_row, + int32_t batch_row) const { + if (first_row >= 0) { + // `skipped_rows_` is in ascending order by construction, so use bisection + // to find out how many rows were skipped before `batch_row`. + const auto skips_before = + std::upper_bound(skipped_rows_.begin(), skipped_rows_.end(), batch_row) - + skipped_rows_.begin(); + status = status.WithMessage("Row #", batch_row + skips_before + first_row, ": ", + status.message()); + } + // Use return_if so that when extra context is enabled it will be added + ARROW_RETURN_IF_(true, std::move(status), ARROW_STRINGIFY(status)); + return std::move(status); + } + + // The number of rows in this batch (not including any skipped ones) + int32_t num_rows_ = 0; + // The number of columns + int32_t num_cols_ = 0; + + // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes? + // It may help with null parsing... 
+  std::vector<std::shared_ptr<Buffer>> values_buffers_;
+  std::shared_ptr<Buffer> parsed_buffer_;
+  const uint8_t* parsed_ = NULLPTR;
+  int32_t parsed_size_ = 0;
+
+  // Record the current num_rows_ each time a row is skipped
+  std::vector<int32_t> skipped_rows_;
+
+  friend class ::arrow::csv::BlockParserImpl;
+};
+
+} // namespace detail
+
+constexpr int32_t kMaxParserNumRows = 100000;
+
+/// \class BlockParser
+/// \brief A reusable block-based parser for CSV data
+///
+/// The parser takes a block of CSV data and delimits rows and fields,
+/// unquoting and unescaping them on the fly. Parsed data is owned by the
+/// parser, so the original buffer can be discarded after Parse() returns.
+///
+/// If the block is truncated (i.e. not all data can be parsed), it is up
+/// to the caller to arrange the next block to start with the trailing data.
+/// Also, if the previous block ends with CR (0x0d) and a new block starts
+/// with LF (0x0a), the parser will consider the leading newline as an empty
+/// line; the caller should therefore strip it.
+class ARROW_EXPORT BlockParser {
+ public:
+  explicit BlockParser(ParseOptions options, int32_t num_cols = -1,
+                       int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows);
+  explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1,
+                       int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows);
+  ~BlockParser();
+
+  /// \brief Parse a block of data
+  ///
+  /// Parse a block of CSV data, ingesting up to max_num_rows rows.
+  /// The number of bytes actually parsed is returned in out_size.
+  Status Parse(std::string_view data, uint32_t* out_size);
+
+  /// \brief Parse sequential blocks of data
+  ///
+  /// Only the last block is allowed to be truncated.
+  Status Parse(const std::vector<std::string_view>& data, uint32_t* out_size);
+
+  /// \brief Parse the final block of data
+  ///
+  /// Like Parse(), but called with the final block in a file.
+  /// The last row may lack a trailing line separator.
+  Status ParseFinal(std::string_view data, uint32_t* out_size);
+
+  /// \brief Parse the final sequential blocks of data
+  ///
+  /// Only the last block is allowed to be truncated.
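+  ///
+  /// For illustration, a single-block parse followed by a column visit (assumes
+  /// `csv_text` is a std::string_view over the whole file; sketch only):
+  ///
+  ///   BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/2);
+  ///   uint32_t parsed_size = 0;
+  ///   ARROW_RETURN_NOT_OK(parser.ParseFinal(csv_text, &parsed_size));
+  ///   ARROW_RETURN_NOT_OK(parser.VisitColumn(
+  ///       0, [](const uint8_t* data, uint32_t size, bool quoted) {
+  ///         // each call receives the raw bytes of one value in column 0
+  ///         return Status::OK();
+  ///       }));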
+ Status ParseFinal(const std::vector& data, uint32_t* out_size); + + /// \brief Return the number of parsed rows + int32_t num_rows() const { return parsed_batch().num_rows(); } + /// \brief Return the number of parsed columns + int32_t num_cols() const { return parsed_batch().num_cols(); } + /// \brief Return the total size in bytes of parsed data + uint32_t num_bytes() const { return parsed_batch().num_bytes(); } + + /// \brief Return the total number of rows including rows which were skipped + int32_t total_num_rows() const { + return parsed_batch().num_rows() + parsed_batch().num_skipped_rows(); + } + + /// \brief Return the row number of the first row in the block or -1 if unsupported + int64_t first_row_num() const; + + /// \brief Visit parsed values in a column + /// + /// The signature of the visitor is + /// Status(const uint8_t* data, uint32_t size, bool quoted) + template + Status VisitColumn(int32_t col_index, Visitor&& visit) const { + return parsed_batch().VisitColumn(col_index, first_row_num(), + std::forward(visit)); + } + + template + Status VisitLastRow(Visitor&& visit) const { + return parsed_batch().VisitLastRow(std::forward(visit)); + } + + protected: + std::unique_ptr impl_; + + const detail::DataBatch& parsed_batch() const; +}; + +} // namespace csv +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/reader.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/reader.h new file mode 100644 index 0000000000000000000000000000000000000000..bae301dc14815a6fdf9388a08c4f9068155f20a6 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/reader.h @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/csv/options.h"  // IWYU pragma: keep
+#include "arrow/io/interfaces.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/future.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+class InputStream;
+}  // namespace io
+
+namespace csv {
+
+/// A class that reads an entire CSV file into an Arrow Table
+class ARROW_EXPORT TableReader {
+ public:
+  virtual ~TableReader() = default;
+
+  /// Read the entire CSV file and convert it to an Arrow Table
+  virtual Result<std::shared_ptr<Table>> Read() = 0;
+  /// Read the entire CSV file and convert it to an Arrow Table
+  virtual Future<std::shared_ptr<Table>> ReadAsync() = 0;
+
+  /// Create a TableReader instance
+  static Result<std::shared_ptr<TableReader>> Make(io::IOContext io_context,
+                                                   std::shared_ptr<io::InputStream> input,
+                                                   const ReadOptions&,
+                                                   const ParseOptions&,
+                                                   const ConvertOptions&);
+};
+
+/// \brief A class that reads a CSV file incrementally
+///
+/// Caveats:
+/// - For now, this is always single-threaded (regardless of `ReadOptions::use_threads`).
+/// - Type inference is done on the first block and types are frozen afterwards;
+///   to make sure the right data types are inferred, either set
+///   `ReadOptions::block_size` to a large enough value, or use
+///   `ConvertOptions::column_types` to set the desired data types explicitly.
+class ARROW_EXPORT StreamingReader : public RecordBatchReader {
+ public:
+  virtual ~StreamingReader() = default;
+
+  virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() = 0;
+
+  /// \brief Return the number of bytes which have been read and processed
+  ///
+  /// The returned number includes CSV bytes which the StreamingReader has
+  /// finished processing, but not bytes for which some processing (e.g.
+  /// CSV parsing or conversion to Arrow layout) is still ongoing.
+  ///
+  /// Furthermore, the following rules apply:
+  /// - bytes skipped by `ReadOptions.skip_rows` are counted as being read before
+  ///   any records are returned.
+  /// - bytes read while parsing the header are counted as being read before any
+  ///   records are returned.
+  /// - bytes skipped by `ReadOptions.skip_rows_after_names` are counted after the
+  ///   first batch is returned.
+  virtual int64_t bytes_read() const = 0;
+
+  /// Create a StreamingReader instance
+  ///
+  /// This involves some I/O as the first batch must be loaded during the creation
+  /// process, so it is returned as a future.
+  ///
+  /// Currently, the StreamingReader is not async-reentrant and does not do any fan-out
+  /// parsing (see ARROW-11889)
+  static Future<std::shared_ptr<StreamingReader>> MakeAsync(
+      io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+      arrow::internal::Executor* cpu_executor, const ReadOptions&, const ParseOptions&,
+      const ConvertOptions&);
+
+  static Result<std::shared_ptr<StreamingReader>> Make(
+      io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+      const ReadOptions&, const ParseOptions&, const ConvertOptions&);
+};
+
+/// \brief Count the logical rows of data in a CSV file (i.e. the
+/// number of rows you would get if you read the file into a table).
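+///
+/// For illustration, a minimal synchronous read with TableReader (assumes a
+/// local file "data.csv" readable with default options; sketch only):
+///
+///   ARROW_ASSIGN_OR_RAISE(auto input, io::ReadableFile::Open("data.csv"));
+///   ARROW_ASSIGN_OR_RAISE(
+///       auto reader,
+///       TableReader::Make(io::default_io_context(), input,
+///                         ReadOptions::Defaults(), ParseOptions::Defaults(),
+///                         ConvertOptions::Defaults()));
+///   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Table> table, reader->Read());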
+/// \brief Count the logical rows of data in a CSV file (i.e. the
+/// number of rows you would get if you read the file into a table).
+ARROW_EXPORT
+Future<int64_t> CountRowsAsync(io::IOContext io_context,
+                               std::shared_ptr<io::InputStream> input,
+                               arrow::internal::Executor* cpu_executor,
+                               const ReadOptions&, const ParseOptions&);
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/test_common.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/test_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..07a41604478e81ac760e8d0b3501ef24996b0a4e
--- /dev/null
+++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/test_common.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/csv/parser.h"
+#include "arrow/testing/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+ARROW_TESTING_EXPORT
+std::string MakeCSVData(std::vector<std::string> lines);
+
+// Make a BlockParser from a vector of lines representing a CSV file
+ARROW_TESTING_EXPORT
+void MakeCSVParser(std::vector<std::string> lines, ParseOptions options, int32_t num_cols,
+                   MemoryPool* pool, std::shared_ptr<BlockParser>* out);
+
+ARROW_TESTING_EXPORT
+void MakeCSVParser(std::vector<std::string> lines, ParseOptions options,
+                   std::shared_ptr<BlockParser>* out);
+
+ARROW_TESTING_EXPORT
+void MakeCSVParser(std::vector<std::string> lines, std::shared_ptr<BlockParser>* out);
+
+// Make a BlockParser from a vector of strings representing a single CSV column
+ARROW_TESTING_EXPORT
+void MakeColumnParser(std::vector<std::string> items, std::shared_ptr<BlockParser>* out);
+
+ARROW_TESTING_EXPORT
+Result<std::shared_ptr<Buffer>> MakeSampleCsvBuffer(
+    size_t num_rows, std::function<bool(size_t)> is_valid = {});
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/type_fwd.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0a53847a90ddb82067e0c9ac955cf4222c61742
--- /dev/null
+++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/type_fwd.h
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+namespace arrow {
+namespace csv {
+
+class TableReader;
+struct ConvertOptions;
+struct ReadOptions;
+struct ParseOptions;
+struct WriteOptions;
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/writer.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9d79e16608671859357e3adab88416fb0a9d04f
--- /dev/null
+++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/csv/writer.h
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/csv/options.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+
+namespace arrow {
+namespace csv {
+
+// Functionality for converting Arrow data to comma-separated value text.
+// This library supports all primitive types that can be cast to a StringArray or
+// a LargeStringArray.
+// It applies the following formatting rules:
+// - For non-binary types no quotes surround values. Nulls are represented as the empty
+//   string.
+// - For binary types all non-null data is quoted (and quotes within data are escaped
+//   with an additional quote).
+//   Null values are empty and unquoted.
+
+/// \defgroup csv-write-functions High-level functions for writing CSV files
+/// @{
+
+/// \brief Convert table to CSV and write the result to output.
+/// Experimental
+ARROW_EXPORT Status WriteCSV(const Table& table, const WriteOptions& options,
+                             arrow::io::OutputStream* output);
+/// \brief Convert batch to CSV and write the result to output.
+/// Experimental
+ARROW_EXPORT Status WriteCSV(const RecordBatch& batch, const WriteOptions& options,
+                             arrow::io::OutputStream* output);
+/// \brief Convert batches read through a RecordBatchReader
+/// to CSV and write the results to output.
+/// Experimental
+ARROW_EXPORT Status WriteCSV(const std::shared_ptr<RecordBatchReader>& reader,
+                             const WriteOptions& options,
+                             arrow::io::OutputStream* output);
+
+/// @}
+
+/// \defgroup csv-writer-factories Functions for creating an incremental CSV writer
+/// @{
+
+/// \brief Create a new CSV writer. User is responsible for closing the
+/// actual OutputStream.
+/// +/// \param[in] sink output stream to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization +/// \return Result> +ARROW_EXPORT +Result> MakeCSVWriter( + std::shared_ptr sink, const std::shared_ptr& schema, + const WriteOptions& options = WriteOptions::Defaults()); + +/// \brief Create a new CSV writer. +/// +/// \param[in] sink output stream to write to (does not take ownership) +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization +/// \return Result> +ARROW_EXPORT +Result> MakeCSVWriter( + io::OutputStream* sink, const std::shared_ptr& schema, + const WriteOptions& options = WriteOptions::Defaults()); + +/// @} + +} // namespace csv +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/api.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/api.h new file mode 100644 index 0000000000000000000000000000000000000000..38caa1cff19def66d09d0d6ed25c67ce52259f9a --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/api.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include "arrow/compute/expression.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/discovery.h" +#include "arrow/dataset/file_base.h" +#ifdef ARROW_CSV +# include "arrow/dataset/file_csv.h" +#endif +#ifdef ARROW_JSON +# include "arrow/dataset/file_json.h" +#endif +#include "arrow/dataset/file_ipc.h" +#ifdef ARROW_ORC +# include "arrow/dataset/file_orc.h" +#endif +#ifdef ARROW_PARQUET +# include "arrow/dataset/file_parquet.h" +#endif +#include "arrow/dataset/scanner.h" diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset.h new file mode 100644 index 0000000000000000000000000000000000000000..1cdd92d5c42f2717c00b7bdeb2c7adc6117754b5 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset.h @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/compute/expression.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/util/async_generator_fwd.h" +#include "arrow/util/future.h" +#include "arrow/util/macros.h" +#include "arrow/util/mutex.h" + +namespace arrow { + +namespace internal { +class Executor; +} // namespace internal + +namespace dataset { + +using RecordBatchGenerator = std::function>()>; + +/// \brief Description of a column to scan +struct ARROW_DS_EXPORT FragmentSelectionColumn { + /// \brief The path to the column to load + FieldPath path; + /// \brief The type of the column in the dataset schema + /// + /// A format may choose to ignore this field completely. For example, when + /// reading from IPC the reader can just return the column in the data type + /// that is stored on disk. There is no point in doing anything special. + /// + /// However, some formats may be capable of casting on the fly. For example, + /// when reading from CSV, if we know the target type of the column, we can + /// convert from string to the target type as we read. + DataType* requested_type; +}; + +/// \brief A list of columns that should be loaded from a fragment +/// +/// The paths in this selection should be referring to the fragment schema. This class +/// contains a virtual destructor as it is expected evolution strategies will need to +/// extend this to add any information needed to later evolve the batches. +/// +/// For example, in the basic evolution strategy, we keep track of which columns +/// were missing from the file so that we can fill those in with null when evolving. +class ARROW_DS_EXPORT FragmentSelection { + public: + explicit FragmentSelection(std::vector columns) + : columns_(std::move(columns)) {} + virtual ~FragmentSelection() = default; + /// The columns that should be loaded from the fragment + const std::vector& columns() const { return columns_; } + + private: + std::vector columns_; +}; + +/// \brief Instructions for scanning a particular fragment +/// +/// The fragment scan request is derived from ScanV2Options. The main +/// difference is that the scan options are based on the dataset schema +/// while the fragment request is based on the fragment schema. +struct ARROW_DS_EXPORT FragmentScanRequest { + /// \brief A row filter + /// + /// The filter expression should be written against the fragment schema. + /// + /// \see ScanV2Options for details on how this filter should be applied + compute::Expression filter = compute::literal(true); + + /// \brief The columns to scan + /// + /// These indices refer to the fragment schema + /// + /// Note: This is NOT a simple list of top-level column indices. + /// For more details \see ScanV2Options + /// + /// If possible a fragment should only read from disk the data needed + /// to satisfy these columns. If a format cannot partially read a nested + /// column (e.g. JSON) then it must apply the column selection (in memory) + /// before returning the scanned batch. 
+ std::shared_ptr fragment_selection; + /// \brief Options specific to the format being scanned + const FragmentScanOptions* format_scan_options; +}; + +/// \brief An iterator-like object that can yield batches created from a fragment +class ARROW_DS_EXPORT FragmentScanner { + public: + /// This instance will only be destroyed after all ongoing scan futures + /// have been completed. + /// + /// This means any callbacks created as part of the scan can safely + /// capture `this` + virtual ~FragmentScanner() = default; + /// \brief Scan a batch of data from the file + /// \param batch_number The index of the batch to read + virtual Future> ScanBatch(int batch_number) = 0; + /// \brief Calculate an estimate of how many data bytes the given batch will represent + /// + /// "Data bytes" should be the total size of all the buffers once the data has been + /// decoded into the Arrow format. + virtual int64_t EstimatedDataBytes(int batch_number) = 0; + /// \brief The number of batches in the fragment to scan + virtual int NumBatches() = 0; +}; + +/// \brief Information learned about a fragment through inspection +/// +/// This information can be used to figure out which fields need +/// to be read from a file and how the data read in should be evolved +/// to match the dataset schema. +/// +/// For example, from a CSV file we can inspect and learn the column +/// names and use those column names to determine which columns to load +/// from the CSV file. +struct ARROW_DS_EXPORT InspectedFragment { + explicit InspectedFragment(std::vector column_names) + : column_names(std::move(column_names)) {} + std::vector column_names; +}; + +/// \brief A granular piece of a Dataset, such as an individual file. +/// +/// A Fragment can be read/scanned separately from other fragments. It yields a +/// collection of RecordBatches when scanned +/// +/// Note that Fragments have well defined physical schemas which are reconciled by +/// the Datasets which contain them; these physical schemas may differ from a parent +/// Dataset's schema and the physical schemas of sibling Fragments. +class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this { + public: + /// \brief An expression that represents no known partition information + static const compute::Expression kNoPartitionInformation; + + /// \brief Return the physical schema of the Fragment. + /// + /// The physical schema is also called the writer schema. + /// This method is blocking and may suffer from high latency filesystem. + /// The schema is cached after being read once, or may be specified at construction. + Result> ReadPhysicalSchema(); + + /// An asynchronous version of Scan + virtual Result ScanBatchesAsync( + const std::shared_ptr& options) = 0; + + /// \brief Inspect a fragment to learn basic information + /// + /// This will be called before a scan and a fragment should attach whatever + /// information will be needed to figure out an evolution strategy. This information + /// will then be passed to the call to BeginScan + virtual Future> InspectFragment( + const FragmentScanOptions* format_options, compute::ExecContext* exec_context); + + /// \brief Start a scan operation + virtual Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, compute::ExecContext* exec_context); + + /// \brief Count the number of rows in this fragment matching the filter using metadata + /// only. That is, this method may perform I/O, but will not load data. 
+ /// + /// If this is not possible, resolve with an empty optional. The fragment can perform + /// I/O (e.g. to read metadata) before it deciding whether it can satisfy the request. + virtual Future> CountRows( + compute::Expression predicate, const std::shared_ptr& options); + + virtual std::string type_name() const = 0; + virtual std::string ToString() const { return type_name(); } + + /// \brief An expression which evaluates to true for all data viewed by this + /// Fragment. + const compute::Expression& partition_expression() const { + return partition_expression_; + } + + virtual ~Fragment() = default; + + protected: + Fragment() = default; + explicit Fragment(compute::Expression partition_expression, + std::shared_ptr physical_schema); + + virtual Result> ReadPhysicalSchemaImpl() = 0; + + util::Mutex physical_schema_mutex_; + compute::Expression partition_expression_ = compute::literal(true); + std::shared_ptr physical_schema_; +}; + +/// \brief Per-scan options for fragment(s) in a dataset. +/// +/// These options are not intrinsic to the format or fragment itself, but do affect +/// the results of a scan. These are options which make sense to change between +/// repeated reads of the same dataset, such as format-specific conversion options +/// (that do not affect the schema). +/// +/// \ingroup dataset-scanning +class ARROW_DS_EXPORT FragmentScanOptions { + public: + virtual std::string type_name() const = 0; + virtual std::string ToString() const { return type_name(); } + virtual ~FragmentScanOptions() = default; +}; + +/// \defgroup dataset-implementations Concrete implementations +/// +/// @{ + +/// \brief A trivial Fragment that yields ScanTask out of a fixed set of +/// RecordBatch. +class ARROW_DS_EXPORT InMemoryFragment : public Fragment { + public: + class Scanner; + InMemoryFragment(std::shared_ptr schema, RecordBatchVector record_batches, + compute::Expression = compute::literal(true)); + explicit InMemoryFragment(RecordBatchVector record_batches, + compute::Expression = compute::literal(true)); + + Result ScanBatchesAsync( + const std::shared_ptr& options) override; + Future> CountRows( + compute::Expression predicate, + const std::shared_ptr& options) override; + + Future> InspectFragment( + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) override; + Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) override; + + std::string type_name() const override { return "in-memory"; } + + protected: + Result> ReadPhysicalSchemaImpl() override; + + RecordBatchVector record_batches_; +}; + +/// @} + +using FragmentGenerator = AsyncGenerator>; + +/// \brief Rules for converting the dataset schema to and from fragment schemas +class ARROW_DS_EXPORT FragmentEvolutionStrategy { + public: + /// This instance will only be destroyed when all scan operations for the + /// fragment have completed. + virtual ~FragmentEvolutionStrategy() = default; + /// \brief A guarantee that applies to all batches of this fragment + /// + /// For example, if a fragment is missing one of the fields in the dataset + /// schema then a typical evolution strategy is to set that field to null. + /// + /// So if the column at index 3 is missing then the guarantee is + /// FieldRef(3) == null + /// + /// Individual field guarantees should be AND'd together and returned + /// as a single expression. 
+ virtual Result GetGuarantee( + const std::vector& dataset_schema_selection) const = 0; + + /// \brief Return a fragment schema selection given a dataset schema selection + /// + /// For example, if the user wants fields 2 & 4 of the dataset schema and + /// in this fragment the field 2 is missing and the field 4 is at index 1 then + /// this should return {1} + virtual Result> DevolveSelection( + const std::vector& dataset_schema_selection) const = 0; + + /// \brief Return a filter expression bound to the fragment schema given + /// a filter expression bound to the dataset schema + /// + /// The dataset scan filter will first be simplified by the guarantee returned + /// by GetGuarantee. This means an evolution that only handles dropping or casting + /// fields doesn't need to do anything here except return the given filter. + /// + /// On the other hand, an evolution that is doing some kind of aliasing will likely + /// need to convert field references in the filter to the aliased field references + /// where appropriate. + virtual Result DevolveFilter( + const compute::Expression& filter) const = 0; + + /// \brief Convert a batch from the fragment schema to the dataset schema + /// + /// Typically this involves casting columns from the data type stored on disk + /// to the data type of the dataset schema. For example, this fragment might + /// have columns stored as int32 and the dataset schema might have int64 for + /// the column. In this case we should cast the column from int32 to int64. + /// + /// Note: A fragment may perform this cast as the data is read from disk. In + /// that case a cast might not be needed. + virtual Result EvolveBatch( + const std::shared_ptr& batch, + const std::vector& dataset_selection, + const FragmentSelection& selection) const = 0; + + /// \brief Return a string description of this strategy + virtual std::string ToString() const = 0; +}; + +/// \brief Lookup to create a FragmentEvolutionStrategy for a given fragment +class ARROW_DS_EXPORT DatasetEvolutionStrategy { + public: + virtual ~DatasetEvolutionStrategy() = default; + /// \brief Create a strategy for evolving from the given fragment + /// to the schema of the given dataset + virtual std::unique_ptr GetStrategy( + const Dataset& dataset, const Fragment& fragment, + const InspectedFragment& inspected_fragment) = 0; + + /// \brief Return a string description of this strategy + virtual std::string ToString() const = 0; +}; + +ARROW_DS_EXPORT std::unique_ptr +MakeBasicDatasetEvolutionStrategy(); + +/// \brief A container of zero or more Fragments. +/// +/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a +/// directory. A Dataset has a schema to which Fragments must align during a +/// scan operation. This is analogous to Avro's reader and writer schema. +class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { + public: + /// \brief Begin to build a new Scan operation against this Dataset + Result> NewScan(); + + /// \brief GetFragments returns an iterator of Fragments given a predicate. + Result GetFragments(compute::Expression predicate); + Result GetFragments(); + + /// \brief Async versions of `GetFragments`. + Result GetFragmentsAsync(compute::Expression predicate); + Result GetFragmentsAsync(); + + const std::shared_ptr& schema() const { return schema_; } + + /// \brief An expression which evaluates to true for all data viewed by this Dataset. + /// May be null, which indicates no information is available. 
+ const compute::Expression& partition_expression() const { + return partition_expression_; + } + + /// \brief The name identifying the kind of Dataset + virtual std::string type_name() const = 0; + + /// \brief Return a copy of this Dataset with a different schema. + /// + /// The copy will view the same Fragments. If the new schema is not compatible with the + /// original dataset's schema then an error will be raised. + virtual Result> ReplaceSchema( + std::shared_ptr schema) const = 0; + + /// \brief Rules used by this dataset to handle schema evolution + DatasetEvolutionStrategy* evolution_strategy() { return evolution_strategy_.get(); } + + virtual ~Dataset() = default; + + protected: + explicit Dataset(std::shared_ptr schema) : schema_(std::move(schema)) {} + + Dataset(std::shared_ptr schema, compute::Expression partition_expression); + + virtual Result GetFragmentsImpl(compute::Expression predicate) = 0; + /// \brief Default non-virtual implementation method for the base + /// `GetFragmentsAsyncImpl` method, which creates a fragment generator for + /// the dataset, possibly filtering results with a predicate (forwarding to + /// the synchronous `GetFragmentsImpl` method and moving the computations + /// to the background, using the IO thread pool). + /// + /// Currently, `executor` is always the same as `internal::GetCPUThreadPool()`, + /// which means the results from the underlying fragment generator will be + /// transferred to the default CPU thread pool. The generator itself is + /// offloaded to run on the default IO thread pool. + virtual Result GetFragmentsAsyncImpl( + compute::Expression predicate, arrow::internal::Executor* executor); + + std::shared_ptr schema_; + compute::Expression partition_expression_ = compute::literal(true); + std::unique_ptr evolution_strategy_ = + MakeBasicDatasetEvolutionStrategy(); +}; + +/// \addtogroup dataset-implementations +/// +/// @{ + +/// \brief A Source which yields fragments wrapping a stream of record batches. +/// +/// The record batches must match the schema provided to the source at construction. +class ARROW_DS_EXPORT InMemoryDataset : public Dataset { + public: + class RecordBatchGenerator { + public: + virtual ~RecordBatchGenerator() = default; + virtual RecordBatchIterator Get() const = 0; + }; + + /// Construct a dataset from a schema and a factory of record batch iterators. + InMemoryDataset(std::shared_ptr schema, + std::shared_ptr get_batches) + : Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {} + + /// Convenience constructor taking a fixed list of batches + InMemoryDataset(std::shared_ptr schema, RecordBatchVector batches); + + /// Convenience constructor taking a Table + explicit InMemoryDataset(std::shared_ptr
table); + + std::string type_name() const override { return "in-memory"; } + + Result> ReplaceSchema( + std::shared_ptr schema) const override; + + protected: + Result GetFragmentsImpl(compute::Expression predicate) override; + + std::shared_ptr get_batches_; +}; + +/// \brief A Dataset wrapping child Datasets. +class ARROW_DS_EXPORT UnionDataset : public Dataset { + public: + /// \brief Construct a UnionDataset wrapping child Datasets. + /// + /// \param[in] schema the schema of the resulting dataset. + /// \param[in] children one or more child Datasets. Their schemas must be identical to + /// schema. + static Result> Make(std::shared_ptr schema, + DatasetVector children); + + const DatasetVector& children() const { return children_; } + + std::string type_name() const override { return "union"; } + + Result> ReplaceSchema( + std::shared_ptr schema) const override; + + protected: + Result GetFragmentsImpl(compute::Expression predicate) override; + + explicit UnionDataset(std::shared_ptr schema, DatasetVector children) + : Dataset(std::move(schema)), children_(std::move(children)) {} + + DatasetVector children_; + + friend class UnionDatasetFactory; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset_writer.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset_writer.h new file mode 100644 index 0000000000000000000000000000000000000000..edb1649b5f196aa3c6cd923c9e6540c4173fc102 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset_writer.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/util/async_util.h" +#include "arrow/util/future.h" + +namespace arrow { +namespace dataset { +namespace internal { + +// This lines up with our other defaults in the scanner and execution plan +constexpr uint64_t kDefaultDatasetWriterMaxRowsQueued = 8 * 1024 * 1024; + +/// \brief Utility class that manages a set of writers to different paths +/// +/// Writers may be closed and reopened (and a new file created) based on the dataset +/// write options (for example, max_rows_per_file or max_open_files) +/// +/// The dataset writer enforces its own back pressure based on the # of rows (as opposed +/// to # of batches which is how it is typically enforced elsewhere) and # of files. 
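As a small aside on the Dataset classes above, the following hedged sketch (not part of the vendored headers; ScanAll is a made-up helper) wraps an existing arrow::Table in an InMemoryDataset and scans it back out through the standard ScannerBuilder path.

#include <memory>

#include "arrow/dataset/dataset.h"
#include "arrow/dataset/scanner.h"
#include "arrow/result.h"
#include "arrow/table.h"

// Wrap a table in an in-memory dataset and materialize it again via a scan.
arrow::Result<std::shared_ptr<arrow::Table>> ScanAll(
    const std::shared_ptr<arrow::Table>& table) {
  auto dataset = std::make_shared<arrow::dataset::InMemoryDataset>(table);
  ARROW_ASSIGN_OR_RAISE(auto scanner_builder, dataset->NewScan());
  ARROW_ASSIGN_OR_RAISE(auto scanner, scanner_builder->Finish());
  return scanner->ToTable();  // reads all fragments back into a single table
}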
+class ARROW_DS_EXPORT DatasetWriter { + public: + /// \brief Create a dataset writer + /// + /// Will fail if basename_template is invalid or if there is existing data and + /// existing_data_behavior is kError + /// + /// \param write_options options to control how the data should be written + /// \param max_rows_queued max # of rows allowed to be queued before the dataset_writer + /// will ask for backpressure + static Result> Make( + FileSystemDatasetWriteOptions write_options, util::AsyncTaskScheduler* scheduler, + std::function pause_callback, std::function resume_callback, + std::function finish_callback, + uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued); + + ~DatasetWriter(); + + /// \brief Write a batch to the dataset + /// \param[in] batch The batch to write + /// \param[in] directory The directory to write to + /// + /// Note: The written filename will be {directory}/{filename_factory(i)} where i is a + /// counter controlled by `max_open_files` and `max_rows_per_file` + /// + /// If multiple WriteRecordBatch calls arrive with the same `directory` then the batches + /// may be written to the same file. + /// + /// The returned future will be marked finished when the record batch has been queued + /// to be written. If the returned future is unfinished then this indicates the dataset + /// writer's queue is full and the data provider should pause. + /// + /// This method is NOT async reentrant. The returned future will only be unfinished + /// if back pressure needs to be applied. Async reentrancy is not necessary for + /// concurrent writes to happen. Calling this method again before the previous future + /// completes will not just violate max_rows_queued but likely lead to race conditions. + /// + /// One thing to note is that the ordering of your data can affect your maximum + /// potential parallelism. If this seems odd then consider a dataset where the first + /// 1000 batches go to the same directory and then the 1001st batch goes to a different + /// directory. The only way to get two parallel writes immediately would be to queue + /// all 1000 pending writes to the first directory. + void WriteRecordBatch(std::shared_ptr batch, const std::string& directory, + const std::string& prefix = ""); + + /// Finish all pending writes and close any open files + void Finish(); + + protected: + DatasetWriter(FileSystemDatasetWriteOptions write_options, + util::AsyncTaskScheduler* scheduler, std::function pause_callback, + std::function resume_callback, + std::function finish_callback, + uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued); + + class DatasetWriterImpl; + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/discovery.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/discovery.h new file mode 100644 index 0000000000000000000000000000000000000000..6d76dcef727e7643ba559d8802665755a4f8a870 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/discovery.h @@ -0,0 +1,275 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Logic for automatically determining the structure of multi-file +/// dataset with possible partitioning according to available +/// partitioning + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/dataset/partition.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/filesystem/type_fwd.h" +#include "arrow/result.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace dataset { + +/// \defgroup dataset-discovery Discovery API +/// +/// @{ + +struct InspectOptions { + /// See `fragments` property. + static constexpr int kInspectAllFragments = -1; + + /// Indicate how many fragments should be inspected to infer the unified dataset + /// schema. Limiting the number of fragments accessed improves the latency of + /// the discovery process when dealing with a high number of fragments and/or + /// high latency file systems. + /// + /// The default value of `1` inspects the schema of the first (in no particular + /// order) fragment only. If the dataset has a uniform schema for all fragments, + /// this default is the optimal value. In order to inspect all fragments and + /// robustly unify their potentially varying schemas, set this option to + /// `kInspectAllFragments`. A value of `0` disables inspection of fragments + /// altogether so only the partitioning schema will be inspected. + int fragments = 1; + + /// Control how to unify types. By default, types are merged strictly (the + /// type must match exactly, except nulls can be merged with other types). + Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults(); +}; + +struct FinishOptions { + /// Finalize the dataset with this given schema. If the schema is not + /// provided, infer the schema via the Inspect, see the `inspect_options` + /// property. + std::shared_ptr schema = NULLPTR; + + /// If the schema is not provided, it will be discovered by passing the + /// following options to `DatasetDiscovery::Inspect`. + InspectOptions inspect_options{}; + + /// Indicate if the given Schema (when specified), should be validated against + /// the fragments' schemas. `inspect_options` will control how many fragments + /// are checked. + bool validate_fragments = false; +}; + +/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected +/// schema before materializing said Dataset. +class ARROW_DS_EXPORT DatasetFactory { + public: + /// \brief Get the schemas of the Fragments and Partitioning. + virtual Result>> InspectSchemas( + InspectOptions options) = 0; + + /// \brief Get unified schema for the resulting Dataset. 
+ Result> Inspect(InspectOptions options = {}); + + /// \brief Create a Dataset + Result> Finish(); + /// \brief Create a Dataset with the given schema (see \a InspectOptions::schema) + Result> Finish(std::shared_ptr schema); + /// \brief Create a Dataset with the given options + virtual Result> Finish(FinishOptions options) = 0; + + /// \brief Optional root partition for the resulting Dataset. + const compute::Expression& root_partition() const { return root_partition_; } + /// \brief Set the root partition for the resulting Dataset. + Status SetRootPartition(compute::Expression partition) { + root_partition_ = std::move(partition); + return Status::OK(); + } + + virtual ~DatasetFactory() = default; + + protected: + DatasetFactory(); + + compute::Expression root_partition_; +}; + +/// @} + +/// \brief DatasetFactory provides a way to inspect/discover a Dataset's +/// expected schema before materialization. +/// \ingroup dataset-implementations +class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory { + public: + static Result> Make( + std::vector> factories); + + /// \brief Return the list of child DatasetFactory + const std::vector>& factories() const { + return factories_; + } + + /// \brief Get the schemas of the Datasets. + /// + /// Instead of applying options globally, it applies at each child factory. + /// This will not respect `options.fragments` exactly, but will respect the + /// spirit of peeking the first fragments or all of them. + Result>> InspectSchemas( + InspectOptions options) override; + + /// \brief Create a Dataset. + Result> Finish(FinishOptions options) override; + + protected: + explicit UnionDatasetFactory(std::vector> factories); + + std::vector> factories_; +}; + +/// \ingroup dataset-filesystem +struct FileSystemFactoryOptions { + /// Either an explicit Partitioning or a PartitioningFactory to discover one. + /// + /// If a factory is provided, it will be used to infer a schema for partition fields + /// based on file and directory paths then construct a Partitioning. The default + /// is a Partitioning which will yield no partition information. + /// + /// The (explicit or discovered) partitioning will be applied to discovered files + /// and the resulting partition information embedded in the Dataset. + PartitioningOrFactory partitioning{Partitioning::Default()}; + + /// For the purposes of applying the partitioning, paths will be stripped + /// of the partition_base_dir. Files not matching the partition_base_dir + /// prefix will be skipped for partition discovery. The ignored files will still + /// be part of the Dataset, but will not have partition information. + /// + /// Example: + /// partition_base_dir = "/dataset"; + /// + /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning + /// + /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery. + /// + /// This is useful for partitioning which parses directory when ordering + /// is important, e.g. DirectoryPartitioning. + std::string partition_base_dir; + + /// Invalid files (via selector or explicitly) will be excluded by checking + /// with the FileFormat::IsSupported method. This will incur IO for each files + /// in a serial and single threaded fashion. Disabling this feature will skip the + /// IO, but unsupported files may be present in the Dataset + /// (resulting in an error at scan time). 
+ bool exclude_invalid_files = false; + + /// When discovering from a Selector (and not from an explicit file list), ignore + /// files and directories matching any of these prefixes. + /// + /// Example (with selector = "/dataset/**"): + /// selector_ignore_prefixes = {"_", ".DS_STORE" }; + /// + /// - "/dataset/data.csv" -> not ignored + /// - "/dataset/_metadata" -> ignored + /// - "/dataset/.DS_STORE" -> ignored + /// - "/dataset/_hidden/dat" -> ignored + /// - "/dataset/nested/.DS_STORE" -> ignored + std::vector selector_ignore_prefixes = { + ".", + "_", + }; +}; + +/// \brief FileSystemDatasetFactory creates a Dataset from a vector of +/// fs::FileInfo or a fs::FileSelector. +/// \ingroup dataset-filesystem +class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory { + public: + /// \brief Build a FileSystemDatasetFactory from an explicit list of + /// paths. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] paths passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make( + std::shared_ptr filesystem, const std::vector& paths, + std::shared_ptr format, FileSystemFactoryOptions options); + + /// \brief Build a FileSystemDatasetFactory from a fs::FileSelector. + /// + /// The selector will expand to a vector of FileInfo. The expansion/crawling + /// is performed in this function call. Thus, the finalized Dataset is + /// working with a snapshot of the filesystem. + // + /// If options.partition_base_dir is not provided, it will be overwritten + /// with selector.base_dir. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] selector used to crawl and search files + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make( + std::shared_ptr filesystem, fs::FileSelector selector, + std::shared_ptr format, FileSystemFactoryOptions options); + + /// \brief Build a FileSystemDatasetFactory from an uri including filesystem + /// information. + /// + /// \param[in] uri passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make(std::string uri, + std::shared_ptr format, + FileSystemFactoryOptions options); + + /// \brief Build a FileSystemDatasetFactory from an explicit list of + /// file information. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] files passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. 
+ static Result> Make( + std::shared_ptr filesystem, const std::vector& files, + std::shared_ptr format, FileSystemFactoryOptions options); + + Result>> InspectSchemas( + InspectOptions options) override; + + Result> Finish(FinishOptions options) override; + + protected: + FileSystemDatasetFactory(std::vector files, + std::shared_ptr filesystem, + std::shared_ptr format, + FileSystemFactoryOptions options); + + Result> PartitionSchema(); + + std::vector files_; + std::shared_ptr fs_; + std::shared_ptr format_; + FileSystemFactoryOptions options_; +}; + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_base.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_base.h new file mode 100644 index 0000000000000000000000000000000000000000..46fc8ebc40db097a0bb3fc25f00351c68e36991f --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_base.h @@ -0,0 +1,495 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
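Before moving on to file_base.h, here is a hedged sketch of how the discovery API above is typically driven. It is not part of the vendored headers; the local path, the recursive selector, the Hive-style partitioning factory and the Parquet format are assumptions for the example.

#include <memory>

#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/partition.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/filesystem/localfs.h"
#include "arrow/result.h"

// Crawl a directory of Parquet files and turn it into a Dataset.
arrow::Result<std::shared_ptr<arrow::dataset::Dataset>> DiscoverDataset() {
  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
  arrow::fs::FileSelector selector;
  selector.base_dir = "/data/dataset";  // assumed location of the files
  selector.recursive = true;

  arrow::dataset::FileSystemFactoryOptions options;
  // Infer partition fields from key=value directory names (Hive style).
  options.partitioning = arrow::dataset::HivePartitioning::MakeFactory();

  ARROW_ASSIGN_OR_RAISE(
      auto factory,
      arrow::dataset::FileSystemDatasetFactory::Make(
          fs, selector, std::make_shared<arrow::dataset::ParquetFileFormat>(), options));
  return factory->Finish();  // inspects fragment schemas and builds the Dataset
}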
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/partition.h" +#include "arrow/dataset/scanner.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/filesystem/filesystem.h" +#include "arrow/io/file.h" +#include "arrow/type_fwd.h" +#include "arrow/util/compression.h" + +namespace arrow { + +namespace dataset { + +/// \defgroup dataset-file-formats File formats for reading and writing datasets +/// \defgroup dataset-filesystem File system datasets +/// +/// @{ + +/// \brief The path and filesystem where an actual file is located or a buffer which can +/// be read like a file +class ARROW_DS_EXPORT FileSource : public util::EqualityComparable { + public: + FileSource(std::string path, std::shared_ptr filesystem, + Compression::type compression = Compression::UNCOMPRESSED) + : file_info_(std::move(path)), + filesystem_(std::move(filesystem)), + compression_(compression) {} + + FileSource(fs::FileInfo info, std::shared_ptr filesystem, + Compression::type compression = Compression::UNCOMPRESSED) + : file_info_(std::move(info)), + filesystem_(std::move(filesystem)), + compression_(compression) {} + + explicit FileSource(std::shared_ptr buffer, + Compression::type compression = Compression::UNCOMPRESSED) + : buffer_(std::move(buffer)), compression_(compression) {} + + using CustomOpen = std::function>()>; + FileSource(CustomOpen open, int64_t size) + : custom_open_(std::move(open)), custom_size_(size) {} + + using CustomOpenWithCompression = + std::function>(Compression::type)>; + FileSource(CustomOpenWithCompression open_with_compression, int64_t size, + Compression::type compression = Compression::UNCOMPRESSED) + : custom_open_(std::bind(std::move(open_with_compression), compression)), + custom_size_(size), + compression_(compression) {} + + FileSource(std::shared_ptr file, int64_t size, + Compression::type compression = Compression::UNCOMPRESSED) + : custom_open_([=] { return ToResult(file); }), + custom_size_(size), + compression_(compression) {} + + explicit FileSource(std::shared_ptr file, + Compression::type compression = Compression::UNCOMPRESSED); + + FileSource() : custom_open_(CustomOpen{&InvalidOpen}) {} + + static std::vector FromPaths(const std::shared_ptr& fs, + std::vector paths) { + std::vector sources; + for (auto&& path : paths) { + sources.emplace_back(std::move(path), fs); + } + return sources; + } + + /// \brief Return the type of raw compression on the file, if any. + Compression::type compression() const { return compression_; } + + /// \brief Return the file path, if any. Only valid when file source wraps a path. + const std::string& path() const { + static std::string buffer_path = ""; + static std::string custom_open_path = ""; + return filesystem_ ? file_info_.path() : buffer_ ? buffer_path : custom_open_path; + } + + /// \brief Return the filesystem, if any. Otherwise returns nullptr + const std::shared_ptr& filesystem() const { return filesystem_; } + + /// \brief Return the buffer containing the file, if any. Otherwise returns nullptr + const std::shared_ptr& buffer() const { return buffer_; } + + /// \brief Get a RandomAccessFile which views this file source + Result> Open() const; + Future> OpenAsync() const; + + /// \brief Get the size (in bytes) of the file or buffer + /// If the file is compressed this should be the compressed (on-disk) size. 
+ int64_t Size() const; + + /// \brief Get an InputStream which views this file source (and decompresses if needed) + /// \param[in] compression If nullopt, guess the compression scheme from the + /// filename, else decompress with the given codec + Result> OpenCompressed( + std::optional compression = std::nullopt) const; + + /// \brief equality comparison with another FileSource + bool Equals(const FileSource& other) const; + + private: + static Result> InvalidOpen() { + return Status::Invalid("Called Open() on an uninitialized FileSource"); + } + + fs::FileInfo file_info_; + std::shared_ptr filesystem_; + std::shared_ptr buffer_; + CustomOpen custom_open_; + int64_t custom_size_ = 0; + Compression::type compression_ = Compression::UNCOMPRESSED; +}; + +/// \brief Base class for file format implementation +class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this { + public: + /// Options affecting how this format is scanned. + /// + /// The options here can be overridden at scan time. + std::shared_ptr default_fragment_scan_options; + + virtual ~FileFormat() = default; + + /// \brief The name identifying the kind of file format + virtual std::string type_name() const = 0; + + virtual bool Equals(const FileFormat& other) const = 0; + + /// \brief Indicate if the FileSource is supported/readable by this format. + virtual Result IsSupported(const FileSource& source) const = 0; + + /// \brief Return the schema of the file if possible. + virtual Result> Inspect(const FileSource& source) const = 0; + + /// \brief Learn what we need about the file before we start scanning it + virtual Future> InspectFragment( + const FileSource& source, const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const; + + virtual Result ScanBatchesAsync( + const std::shared_ptr& options, + const std::shared_ptr& file) const = 0; + + virtual Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options); + + virtual Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const; + + /// \brief Open a fragment + virtual Result> MakeFragment( + FileSource source, compute::Expression partition_expression, + std::shared_ptr physical_schema); + + /// \brief Create a FileFragment for a FileSource. + Result> MakeFragment( + FileSource source, compute::Expression partition_expression); + + /// \brief Create a FileFragment for a FileSource. + Result> MakeFragment( + FileSource source, std::shared_ptr physical_schema = NULLPTR); + + /// \brief Create a writer for this format. + virtual Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const = 0; + + /// \brief Get default write options for this format. + /// + /// May return null shared_ptr if this file format does not yet support + /// writing datasets. 
+ virtual std::shared_ptr DefaultWriteOptions() = 0; + + protected: + explicit FileFormat(std::shared_ptr default_fragment_scan_options) + : default_fragment_scan_options(std::move(default_fragment_scan_options)) {} +}; + +/// \brief A Fragment that is stored in a file with a known format +class ARROW_DS_EXPORT FileFragment : public Fragment, + public util::EqualityComparable { + public: + Result ScanBatchesAsync( + const std::shared_ptr& options) override; + Future> CountRows( + compute::Expression predicate, + const std::shared_ptr& options) override; + Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) override; + Future> InspectFragment( + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) override; + + std::string type_name() const override { return format_->type_name(); } + std::string ToString() const override { return source_.path(); }; + + const FileSource& source() const { return source_; } + const std::shared_ptr& format() const { return format_; } + + bool Equals(const FileFragment& other) const; + + protected: + FileFragment(FileSource source, std::shared_ptr format, + compute::Expression partition_expression, + std::shared_ptr physical_schema) + : Fragment(std::move(partition_expression), std::move(physical_schema)), + source_(std::move(source)), + format_(std::move(format)) {} + + Result> ReadPhysicalSchemaImpl() override; + + FileSource source_; + std::shared_ptr format_; + + friend class FileFormat; +}; + +/// \brief A Dataset of FileFragments. +/// +/// A FileSystemDataset is composed of one or more FileFragment. The fragments +/// are independent and don't need to share the same format and/or filesystem. +class ARROW_DS_EXPORT FileSystemDataset : public Dataset { + public: + /// \brief Create a FileSystemDataset. + /// + /// \param[in] schema the schema of the dataset + /// \param[in] root_partition the partition expression of the dataset + /// \param[in] format the format of each FileFragment. + /// \param[in] filesystem the filesystem of each FileFragment, or nullptr if the + /// fragments wrap buffers. + /// \param[in] fragments list of fragments to create the dataset from. + /// \param[in] partitioning the Partitioning object in case the dataset is created + /// with a known partitioning (e.g. from a discovered partitioning + /// through a DatasetFactory), or nullptr if not known. + /// + /// Note that fragments wrapping files resident in differing filesystems are not + /// permitted; to work with multiple filesystems use a UnionDataset. + /// + /// \return A constructed dataset. + static Result> Make( + std::shared_ptr schema, compute::Expression root_partition, + std::shared_ptr format, std::shared_ptr filesystem, + std::vector> fragments, + std::shared_ptr partitioning = NULLPTR); + + /// \brief Write a dataset. + static Status Write(const FileSystemDatasetWriteOptions& write_options, + std::shared_ptr scanner); + + /// \brief Return the type name of the dataset. + std::string type_name() const override { return "filesystem"; } + + /// \brief Replace the schema of the dataset. + Result> ReplaceSchema( + std::shared_ptr schema) const override; + + /// \brief Return the path of files. + std::vector files() const; + + /// \brief Return the format. + const std::shared_ptr& format() const { return format_; } + + /// \brief Return the filesystem. May be nullptr if the fragments wrap buffers. 
+ const std::shared_ptr& filesystem() const { return filesystem_; } + + /// \brief Return the partitioning. May be nullptr if the dataset was not constructed + /// with a partitioning. + const std::shared_ptr& partitioning() const { return partitioning_; } + + std::string ToString() const; + + protected: + struct FragmentSubtrees; + + explicit FileSystemDataset(std::shared_ptr schema) + : Dataset(std::move(schema)) {} + + FileSystemDataset(std::shared_ptr schema, + compute::Expression partition_expression) + : Dataset(std::move(schema), partition_expression) {} + + Result GetFragmentsImpl(compute::Expression predicate) override; + + void SetupSubtreePruning(); + + std::shared_ptr format_; + std::shared_ptr filesystem_; + std::vector> fragments_; + std::shared_ptr partitioning_; + + std::shared_ptr subtrees_; +}; + +/// \brief Options for writing a file of this format. +class ARROW_DS_EXPORT FileWriteOptions { + public: + virtual ~FileWriteOptions() = default; + + const std::shared_ptr& format() const { return format_; } + + std::string type_name() const { return format_->type_name(); } + + protected: + explicit FileWriteOptions(std::shared_ptr format) + : format_(std::move(format)) {} + + std::shared_ptr format_; +}; + +/// \brief A writer for this format. +class ARROW_DS_EXPORT FileWriter { + public: + virtual ~FileWriter() = default; + + /// \brief Write the given batch. + virtual Status Write(const std::shared_ptr& batch) = 0; + + /// \brief Write all batches from the reader. + Status Write(RecordBatchReader* batches); + + /// \brief Indicate that writing is done. + virtual Future<> Finish(); + + const std::shared_ptr& format() const { return options_->format(); } + const std::shared_ptr& schema() const { return schema_; } + const std::shared_ptr& options() const { return options_; } + const fs::FileLocator& destination() const { return destination_locator_; } + + /// \brief After Finish() is called, provides number of bytes written to file. + Result GetBytesWritten() const; + + protected: + FileWriter(std::shared_ptr schema, std::shared_ptr options, + std::shared_ptr destination, + fs::FileLocator destination_locator) + : schema_(std::move(schema)), + options_(std::move(options)), + destination_(std::move(destination)), + destination_locator_(std::move(destination_locator)) {} + + virtual Future<> FinishInternal() = 0; + + std::shared_ptr schema_; + std::shared_ptr options_; + std::shared_ptr destination_; + fs::FileLocator destination_locator_; + std::optional bytes_written_; +}; + +/// \brief Options for writing a dataset. +struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { + /// Options for individual fragment writing. + std::shared_ptr file_write_options; + + /// FileSystem into which a dataset will be written. + std::shared_ptr filesystem; + + /// Root directory into which the dataset will be written. + std::string base_dir; + + /// Partitioning used to generate fragment paths. + std::shared_ptr partitioning; + + /// Maximum number of partitions any batch may be written into, default is 1K. + int max_partitions = 1024; + + /// Template string used to generate fragment basenames. + /// {i} will be replaced by an auto incremented integer. + std::string basename_template; + + /// A functor which will be applied on an incremented counter. The result will be + /// inserted into the basename_template in place of {i}. + /// + /// This can be used, for example, to left-pad the file counter. 
+ std::function basename_template_functor; + + /// If greater than 0 then this will limit the maximum number of files that can be left + /// open. If an attempt is made to open too many files then the least recently used file + /// will be closed. If this setting is set too low you may end up fragmenting your data + /// into many small files. + /// + /// The default is 900 which also allows some # of files to be open by the scanner + /// before hitting the default Linux limit of 1024 + uint32_t max_open_files = 900; + + /// If greater than 0 then this will limit how many rows are placed in any single file. + /// Otherwise there will be no limit and one file will be created in each output + /// directory unless files need to be closed to respect max_open_files + uint64_t max_rows_per_file = 0; + + /// If greater than 0 then this will cause the dataset writer to batch incoming data + /// and only write the row groups to the disk when sufficient rows have accumulated. + /// The final row group size may be less than this value and other options such as + /// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes. + uint64_t min_rows_per_group = 0; + + /// If greater than 0 then the dataset writer may split up large incoming batches into + /// multiple row groups. If this value is set then min_rows_per_group should also be + /// set or else you may end up with very small row groups (e.g. if the incoming row + /// group size is just barely larger than this value). + uint64_t max_rows_per_group = 1 << 20; + + /// Controls what happens if an output directory already exists. + ExistingDataBehavior existing_data_behavior = ExistingDataBehavior::kError; + + /// \brief If false the dataset writer will not create directories + /// This is mainly intended for filesystems that do not require directories such as S3. + bool create_dir = true; + + /// Callback to be invoked against all FileWriters before + /// they are finalized with FileWriter::Finish(). + std::function writer_pre_finish = [](FileWriter*) { + return Status::OK(); + }; + + /// Callback to be invoked against all FileWriters after they have + /// called FileWriter::Finish(). + std::function writer_post_finish = [](FileWriter*) { + return Status::OK(); + }; + + const std::shared_ptr& format() const { + return file_write_options->format(); + } +}; + +/// \brief Wraps FileSystemDatasetWriteOptions for consumption as compute::ExecNodeOptions +class ARROW_DS_EXPORT WriteNodeOptions : public acero::ExecNodeOptions { + public: + explicit WriteNodeOptions( + FileSystemDatasetWriteOptions options, + std::shared_ptr custom_metadata = NULLPTR) + : write_options(std::move(options)), custom_metadata(std::move(custom_metadata)) {} + + /// \brief Options to control how to write the dataset + FileSystemDatasetWriteOptions write_options; + /// \brief Optional schema to attach to all written batches + /// + /// By default, we will use the output schema of the input. + /// + /// This can be used to alter schema metadata, field nullability, or field metadata. + /// However, this cannot be used to change the type of data. If the custom schema does + /// not have the same number of fields and the same data types as the input then the + /// plan will fail. 
+ std::shared_ptr custom_schema; + /// \brief Optional metadata to attach to written batches + std::shared_ptr custom_metadata; +}; + +/// @} + +namespace internal { +ARROW_DS_EXPORT void InitializeDatasetWriter(arrow::acero::ExecFactoryRegistry* registry); +} + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_csv.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_csv.h new file mode 100644 index 0000000000000000000000000000000000000000..42e3fd7246988e625e0d2e69a29bd40c553e3219 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_csv.h @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/status.h" +#include "arrow/util/compression.h" + +namespace arrow { +namespace dataset { + +constexpr char kCsvTypeName[] = "csv"; + +/// \addtogroup dataset-file-formats +/// +/// @{ + +/// \brief A FileFormat implementation that reads from and writes to Csv files +class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { + public: + // TODO(ARROW-18328) Remove this, moved to CsvFragmentScanOptions + /// Options affecting the parsing of CSV files + csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); + + CsvFileFormat(); + + std::string type_name() const override { return kCsvTypeName; } + + bool Equals(const FileFormat& other) const override; + + Result IsSupported(const FileSource& source) const override; + + /// \brief Return the schema of the file if possible. 
+ Result> Inspect(const FileSource& source) const override; + + Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& scan_options, + const std::shared_ptr& file) const override; + + Future> InspectFragment( + const FileSource& source, const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options) override; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +/// \brief Per-scan options for CSV fragments +struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions { + std::string type_name() const override { return kCsvTypeName; } + + using StreamWrapFunc = std::function>( + std::shared_ptr)>; + + /// CSV conversion options + csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults(); + + /// CSV reading options + /// + /// Note that use_threads is always ignored. + csv::ReadOptions read_options = csv::ReadOptions::Defaults(); + + /// CSV parse options + csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); + + /// Optional stream wrapping function + /// + /// If defined, all open dataset file fragments will be passed + /// through this function. One possible use case is to transparently + /// transcode all input files from a given character set to utf8. + StreamWrapFunc stream_transform_func{}; +}; + +class ARROW_DS_EXPORT CsvFileWriteOptions : public FileWriteOptions { + public: + /// Options passed to csv::MakeCSVWriter. + std::shared_ptr write_options; + + protected: + explicit CsvFileWriteOptions(std::shared_ptr format) + : FileWriteOptions(std::move(format)) {} + + friend class CsvFileFormat; +}; + +class ARROW_DS_EXPORT CsvFileWriter : public FileWriter { + public: + Status Write(const std::shared_ptr& batch) override; + + private: + CsvFileWriter(std::shared_ptr destination, + std::shared_ptr writer, + std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator); + + Future<> FinishInternal() override; + + std::shared_ptr destination_; + std::shared_ptr batch_writer_; + + friend class CsvFileFormat; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_ipc.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_ipc.h new file mode 100644 index 0000000000000000000000000000000000000000..0f7da82a0af5b1e58b724646853e8f482781778b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_ipc.h @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/io/type_fwd.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/result.h" + +namespace arrow { +namespace dataset { + +/// \addtogroup dataset-file-formats +/// +/// @{ + +constexpr char kIpcTypeName[] = "ipc"; + +/// \brief A FileFormat implementation that reads from and writes to Ipc files +class ARROW_DS_EXPORT IpcFileFormat : public FileFormat { + public: + std::string type_name() const override { return kIpcTypeName; } + + IpcFileFormat(); + + bool Equals(const FileFormat& other) const override { + return type_name() == other.type_name(); + } + + Result IsSupported(const FileSource& source) const override; + + /// \brief Return the schema of the file if possible. + Result> Inspect(const FileSource& source) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& options, + const std::shared_ptr& file) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options) override; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +/// \brief Per-scan options for IPC fragments +class ARROW_DS_EXPORT IpcFragmentScanOptions : public FragmentScanOptions { + public: + std::string type_name() const override { return kIpcTypeName; } + + /// Options passed to the IPC file reader. + /// included_fields, memory_pool, and use_threads are ignored. + std::shared_ptr options; + /// If present, the async scanner will enable I/O coalescing. + /// This is ignored by the sync scanner. + std::shared_ptr cache_options; +}; + +class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions { + public: + /// Options passed to ipc::MakeFileWriter. 
use_threads is ignored + std::shared_ptr options; + + /// custom_metadata written to the file's footer + std::shared_ptr metadata; + + protected: + explicit IpcFileWriteOptions(std::shared_ptr format) + : FileWriteOptions(std::move(format)) {} + + friend class IpcFileFormat; +}; + +class ARROW_DS_EXPORT IpcFileWriter : public FileWriter { + public: + Status Write(const std::shared_ptr& batch) override; + + private: + IpcFileWriter(std::shared_ptr destination, + std::shared_ptr writer, + std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator); + + Future<> FinishInternal() override; + + std::shared_ptr destination_; + std::shared_ptr batch_writer_; + + friend class IpcFileFormat; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_json.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_json.h new file mode 100644 index 0000000000000000000000000000000000000000..4b8112d87095ccc9d02b0c52b4df2b1e674b8cc5 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_json.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
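A minimal sketch (not part of the vendored headers) of how the FileSystemDatasetWriteOptions declared in file_base.h above can be combined with the IPC format to persist a scanned dataset; it assumes an already-constructed Scanner (declared in scanner.h later in this diff), a local filesystem, and a hypothetical output directory, with error handling elided.

#include <memory>
#include <utility>

#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_ipc.h"
#include "arrow/dataset/partition.h"
#include "arrow/dataset/scanner.h"
#include "arrow/filesystem/localfs.h"

namespace ds = arrow::dataset;

arrow::Status WriteScannerAsIpc(std::shared_ptr<ds::Scanner> scanner) {
  auto format = std::make_shared<ds::IpcFileFormat>();

  ds::FileSystemDatasetWriteOptions write_options;
  write_options.file_write_options = format->DefaultWriteOptions();
  write_options.filesystem = std::make_shared<arrow::fs::LocalFileSystem>();
  write_options.base_dir = "/tmp/ipc_dataset";          // hypothetical output root
  write_options.partitioning = ds::Partitioning::Default();
  write_options.basename_template = "part-{i}.arrow";   // {i} is auto-incremented
  write_options.max_rows_per_file = 1 << 20;            // optional: cap rows per file
  write_options.existing_data_behavior = ds::ExistingDataBehavior::kOverwriteOrIgnore;

  return ds::FileSystemDataset::Write(write_options, std::move(scanner));
}

Write() drives the scanner to completion and emits one or more files per partition directory, subject to the max_open_files / max_rows_per_file limits set above.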
+ +#pragma once + +#include +#include +#include + +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/json/options.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/future.h" +#include "arrow/util/macros.h" + +namespace arrow::dataset { + +/// \addtogroup dataset-file-formats +/// +/// @{ + +constexpr char kJsonTypeName[] = "json"; + +/// \brief A FileFormat implementation that reads from JSON files +class ARROW_DS_EXPORT JsonFileFormat : public FileFormat { + public: + JsonFileFormat(); + + std::string type_name() const override { return kJsonTypeName; } + + bool Equals(const FileFormat& other) const override; + + Result IsSupported(const FileSource& source) const override; + + Result> Inspect(const FileSource& source) const override; + + Future> InspectFragment( + const FileSource& source, const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const override; + + Future> BeginScan( + const FragmentScanRequest& scan_request, const InspectedFragment& inspected, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& scan_options, + const std::shared_ptr& file) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& scan_options) override; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override { + return Status::NotImplemented("Writing JSON files is not currently supported"); + } + + std::shared_ptr DefaultWriteOptions() override { return NULLPTR; } +}; + +/// \brief Per-scan options for JSON fragments +struct ARROW_DS_EXPORT JsonFragmentScanOptions : public FragmentScanOptions { + std::string type_name() const override { return kJsonTypeName; } + + /// @brief Options that affect JSON parsing + /// + /// Note: `explicit_schema` and `unexpected_field_behavior` are ignored. + json::ParseOptions parse_options = json::ParseOptions::Defaults(); + + /// @brief Options that affect JSON reading + json::ReadOptions read_options = json::ReadOptions::Defaults(); +}; + +/// @} + +} // namespace arrow::dataset diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_orc.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_orc.h new file mode 100644 index 0000000000000000000000000000000000000000..5bfefd1e02b5cccf74cf8ade579a937341aef013 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_orc.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/io/type_fwd.h" +#include "arrow/result.h" + +namespace arrow { +namespace dataset { + +/// \addtogroup dataset-file-formats +/// +/// @{ + +constexpr char kOrcTypeName[] = "orc"; + +/// \brief A FileFormat implementation that reads from and writes to ORC files +class ARROW_DS_EXPORT OrcFileFormat : public FileFormat { + public: + OrcFileFormat(); + + std::string type_name() const override { return kOrcTypeName; } + + bool Equals(const FileFormat& other) const override { + return type_name() == other.type_name(); + } + + Result IsSupported(const FileSource& source) const override; + + /// \brief Return the schema of the file if possible. + Result> Inspect(const FileSource& source) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& options, + const std::shared_ptr& file) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options) override; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_parquet.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_parquet.h new file mode 100644 index 0000000000000000000000000000000000000000..63d8fd729223cdf8813d074c731784368e01a89e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_parquet.h @@ -0,0 +1,404 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
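A minimal sketch (not part of the vendored headers) showing how the per-format scan options declared above, CsvFragmentScanOptions in this case, are attached to a scan; ScannerBuilder and Scanner come from scanner.h later in this diff, and the option values are illustrative only.

#include <memory>
#include <utility>

#include "arrow/dataset/dataset.h"
#include "arrow/dataset/file_csv.h"
#include "arrow/dataset/scanner.h"
#include "arrow/table.h"

namespace ds = arrow::dataset;

arrow::Result<std::shared_ptr<arrow::Table>> ScanPipeDelimitedCsv(
    std::shared_ptr<ds::Dataset> dataset) {
  // Per-scan CSV options: these can change between scans of the same dataset
  // without rebuilding the CsvFileFormat instance.
  auto csv_options = std::make_shared<ds::CsvFragmentScanOptions>();
  csv_options->parse_options.delimiter = '|';
  csv_options->convert_options.strings_can_be_null = true;

  ds::ScannerBuilder builder(std::move(dataset));
  ARROW_RETURN_NOT_OK(builder.FragmentScanOptions(csv_options));
  ARROW_RETURN_NOT_OK(builder.UseThreads(true));
  ARROW_ASSIGN_OR_RAISE(auto scanner, builder.Finish());
  return scanner->ToTable();
}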
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/dataset/discovery.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/io/caching.h" + +namespace parquet { +class ParquetFileReader; +class Statistics; +class ColumnChunkMetaData; +class RowGroupMetaData; +class FileMetaData; +class FileDecryptionProperties; +class FileEncryptionProperties; + +class ReaderProperties; +class ArrowReaderProperties; + +class WriterProperties; +class ArrowWriterProperties; + +namespace arrow { +class FileReader; +class FileWriter; +struct SchemaManifest; +} // namespace arrow +} // namespace parquet + +namespace arrow { +namespace dataset { + +struct ParquetDecryptionConfig; +struct ParquetEncryptionConfig; + +/// \addtogroup dataset-file-formats +/// +/// @{ + +constexpr char kParquetTypeName[] = "parquet"; + +/// \brief A FileFormat implementation that reads from Parquet files +class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { + public: + ParquetFileFormat(); + + /// Convenience constructor which copies properties from a parquet::ReaderProperties. + /// memory_pool will be ignored. + explicit ParquetFileFormat(const parquet::ReaderProperties& reader_properties); + + std::string type_name() const override { return kParquetTypeName; } + + bool Equals(const FileFormat& other) const override; + + struct ReaderOptions { + /// \defgroup parquet-file-format-arrow-reader-properties properties which correspond + /// to members of parquet::ArrowReaderProperties. + /// + /// We don't embed parquet::ReaderProperties directly because column names (rather + /// than indices) are used to indicate dictionary columns, and other options are + /// deferred to scan time. + /// + /// @{ + std::unordered_set dict_columns; + arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO; + /// @} + } reader_options; + + Result IsSupported(const FileSource& source) const override; + + /// \brief Return the schema of the file if possible. + Result> Inspect(const FileSource& source) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& options, + const std::shared_ptr& file) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options) override; + + using FileFormat::MakeFragment; + + /// \brief Create a Fragment targeting all RowGroups. + Result> MakeFragment( + FileSource source, compute::Expression partition_expression, + std::shared_ptr physical_schema) override; + + /// \brief Create a Fragment, restricted to the specified row groups. + Result> MakeFragment( + FileSource source, compute::Expression partition_expression, + std::shared_ptr physical_schema, std::vector row_groups); + + /// \brief Return a FileReader on the given source. 
+ Result> GetReader( + const FileSource& source, const std::shared_ptr& options) const; + + Result> GetReader( + const FileSource& source, const std::shared_ptr& options, + const std::shared_ptr& metadata) const; + + Future> GetReaderAsync( + const FileSource& source, const std::shared_ptr& options) const; + + Future> GetReaderAsync( + const FileSource& source, const std::shared_ptr& options, + const std::shared_ptr& metadata) const; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +/// \brief A FileFragment with parquet logic. +/// +/// ParquetFileFragment provides a lazy (with respect to IO) interface to +/// scan parquet files. Any heavy IO calls are deferred to the Scan() method. +/// +/// The caller can provide an optional list of selected RowGroups to limit the +/// number of scanned RowGroups, or to partition the scans across multiple +/// threads. +/// +/// Metadata can be explicitly provided, enabling pushdown predicate benefits without +/// the potentially heavy IO of loading Metadata from the file system. This can induce +/// significant performance boost when scanning high latency file systems. +class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { + public: + Result SplitByRowGroup(compute::Expression predicate); + + /// \brief Return the RowGroups selected by this fragment. + const std::vector& row_groups() const { + if (row_groups_) return *row_groups_; + static std::vector empty; + return empty; + } + + /// \brief Return the FileMetaData associated with this fragment. + std::shared_ptr metadata(); + + /// \brief Ensure this fragment's FileMetaData is in memory. + Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR); + + /// \brief Return fragment which selects a filtered subset of this fragment's RowGroups. + Result> Subset(compute::Expression predicate); + Result> Subset(std::vector row_group_ids); + + static std::optional EvaluateStatisticsAsExpression( + const Field& field, const parquet::Statistics& statistics); + + static std::optional EvaluateStatisticsAsExpression( + const Field& field, const FieldRef& field_ref, + const parquet::Statistics& statistics); + + private: + ParquetFileFragment(FileSource source, std::shared_ptr format, + compute::Expression partition_expression, + std::shared_ptr physical_schema, + std::optional> row_groups); + + Status SetMetadata(std::shared_ptr metadata, + std::shared_ptr manifest, + std::shared_ptr original_metadata = {}); + + // Overridden to opportunistically set metadata since a reader must be opened anyway. + Result> ReadPhysicalSchemaImpl() override { + ARROW_RETURN_NOT_OK(EnsureCompleteMetadata()); + return physical_schema_; + } + + /// Return a filtered subset of row group indices. + Result> FilterRowGroups(compute::Expression predicate); + /// Simplify the predicate against the statistics of each row group. + Result> TestRowGroups(compute::Expression predicate); + /// Try to count rows matching the predicate using metadata. Expects + /// metadata to be present, and expects the predicate to have been + /// simplified against the partition expression already. + Result> TryCountRows(compute::Expression predicate); + + ParquetFileFormat& parquet_format_; + + /// Indices of row groups selected by this fragment, + /// or std::nullopt if all row groups are selected. 
+ std::optional> row_groups_; + + // the expressions (combined for all columns for which statistics have been + // processed) are stored per column group + std::vector statistics_expressions_; + // statistics status are kept track of by Parquet Schema column indices + // (i.e. not Arrow schema field index) + std::vector statistics_expressions_complete_; + std::shared_ptr metadata_; + std::shared_ptr manifest_; + // The FileMetaData that owns the SchemaDescriptor pointed by SchemaManifest. + std::shared_ptr original_metadata_; + + friend class ParquetFileFormat; + friend class ParquetDatasetFactory; +}; + +/// \brief Per-scan options for Parquet fragments +class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions { + public: + ParquetFragmentScanOptions(); + std::string type_name() const override { return kParquetTypeName; } + + /// Reader properties. Not all properties are respected: memory_pool comes from + /// ScanOptions. + std::shared_ptr reader_properties; + /// Arrow reader properties. Not all properties are respected: batch_size comes from + /// ScanOptions. Additionally, dictionary columns come from + /// ParquetFileFormat::ReaderOptions::dict_columns. + std::shared_ptr arrow_reader_properties; + /// A configuration structure that provides decryption properties for a dataset + std::shared_ptr parquet_decryption_config = NULLPTR; +}; + +class ARROW_DS_EXPORT ParquetFileWriteOptions : public FileWriteOptions { + public: + /// \brief Parquet writer properties. + std::shared_ptr writer_properties; + + /// \brief Parquet Arrow writer properties. + std::shared_ptr arrow_writer_properties; + + // A configuration structure that provides encryption properties for a dataset + std::shared_ptr parquet_encryption_config = NULLPTR; + + protected: + explicit ParquetFileWriteOptions(std::shared_ptr format) + : FileWriteOptions(std::move(format)) {} + + friend class ParquetFileFormat; +}; + +class ARROW_DS_EXPORT ParquetFileWriter : public FileWriter { + public: + const std::shared_ptr& parquet_writer() const { + return parquet_writer_; + } + + Status Write(const std::shared_ptr& batch) override; + + private: + ParquetFileWriter(std::shared_ptr destination, + std::shared_ptr writer, + std::shared_ptr options, + fs::FileLocator destination_locator); + + Future<> FinishInternal() override; + + std::shared_ptr parquet_writer_; + + friend class ParquetFileFormat; +}; + +/// \brief Options for making a FileSystemDataset from a Parquet _metadata file. +struct ParquetFactoryOptions { + /// Either an explicit Partitioning or a PartitioningFactory to discover one. + /// + /// If a factory is provided, it will be used to infer a schema for partition fields + /// based on file and directory paths then construct a Partitioning. The default + /// is a Partitioning which will yield no partition information. + /// + /// The (explicit or discovered) partitioning will be applied to discovered files + /// and the resulting partition information embedded in the Dataset. + PartitioningOrFactory partitioning{Partitioning::Default()}; + + /// For the purposes of applying the partitioning, paths will be stripped + /// of the partition_base_dir. Files not matching the partition_base_dir + /// prefix will be skipped for partition discovery. The ignored files will still + /// be part of the Dataset, but will not have partition information. 
+ /// + /// Example: + /// partition_base_dir = "/dataset"; + /// + /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning + /// + /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery. + /// + /// This is useful for partitioning which parses directory when ordering + /// is important, e.g. DirectoryPartitioning. + std::string partition_base_dir; + + /// Assert that all ColumnChunk paths are consistent. The parquet spec allows for + /// ColumnChunk data to be stored in multiple files, but ParquetDatasetFactory + /// supports only a single file with all ColumnChunk data. If this flag is set + /// construction of a ParquetDatasetFactory will raise an error if ColumnChunk + /// data is not resident in a single file. + bool validate_column_chunk_paths = false; +}; + +/// \brief Create FileSystemDataset from custom `_metadata` cache file. +/// +/// Dask and other systems will generate a cache metadata file by concatenating +/// the RowGroupMetaData of multiple parquet files into a single parquet file +/// that only contains metadata and no ColumnChunk data. +/// +/// ParquetDatasetFactory creates a FileSystemDataset composed of +/// ParquetFileFragment where each fragment is pre-populated with the exact +/// number of row groups and statistics for each columns. +class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory { + public: + /// \brief Create a ParquetDatasetFactory from a metadata path. + /// + /// The `metadata_path` will be read from `filesystem`. Each RowGroup + /// contained in the metadata file will be relative to `dirname(metadata_path)`. + /// + /// \param[in] metadata_path path of the metadata parquet file + /// \param[in] filesystem from which to open/read the path + /// \param[in] format to read the file with. + /// \param[in] options see ParquetFactoryOptions + static Result> Make( + const std::string& metadata_path, std::shared_ptr filesystem, + std::shared_ptr format, ParquetFactoryOptions options); + + /// \brief Create a ParquetDatasetFactory from a metadata source. + /// + /// Similar to the previous Make definition, but the metadata can be a Buffer + /// and the base_path is explicit instead of inferred from the metadata + /// path. + /// + /// \param[in] metadata source to open the metadata parquet file from + /// \param[in] base_path used as the prefix of every parquet files referenced + /// \param[in] filesystem from which to read the files referenced. + /// \param[in] format to read the file with. 
+ /// \param[in] options see ParquetFactoryOptions + static Result> Make( + const FileSource& metadata, const std::string& base_path, + std::shared_ptr filesystem, + std::shared_ptr format, ParquetFactoryOptions options); + + Result>> InspectSchemas( + InspectOptions options) override; + + Result> Finish(FinishOptions options) override; + + protected: + ParquetDatasetFactory( + std::shared_ptr filesystem, + std::shared_ptr format, + std::shared_ptr metadata, + std::shared_ptr manifest, + std::shared_ptr physical_schema, std::string base_path, + ParquetFactoryOptions options, + std::vector>> paths_with_row_group_ids) + : filesystem_(std::move(filesystem)), + format_(std::move(format)), + metadata_(std::move(metadata)), + manifest_(std::move(manifest)), + physical_schema_(std::move(physical_schema)), + base_path_(std::move(base_path)), + options_(std::move(options)), + paths_with_row_group_ids_(std::move(paths_with_row_group_ids)) {} + + std::shared_ptr filesystem_; + std::shared_ptr format_; + std::shared_ptr metadata_; + std::shared_ptr manifest_; + std::shared_ptr physical_schema_; + std::string base_path_; + ParquetFactoryOptions options_; + std::vector>> paths_with_row_group_ids_; + + private: + Result>> CollectParquetFragments( + const Partitioning& partitioning); + + Result> PartitionSchema(); +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/parquet_encryption_config.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/parquet_encryption_config.h new file mode 100644 index 0000000000000000000000000000000000000000..96200b8a3118b82c92977d222ba8775f61a02b0b --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/parquet_encryption_config.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/dataset/type_fwd.h" + +namespace parquet::encryption { +class CryptoFactory; +struct KmsConnectionConfig; +struct EncryptionConfiguration; +struct DecryptionConfiguration; +} // namespace parquet::encryption + +namespace arrow { +namespace dataset { + +/// \brief Core configuration class encapsulating parameters for high-level encryption +/// within Parquet framework. +/// +/// ParquetEncryptionConfig serves as a bridge, passing encryption-related +/// parameters to appropriate components within the Parquet library. It holds references +/// to objects defining encryption strategy, Key Management Service (KMS) configuration, +/// and specific encryption configurations for Parquet data. 
+struct ARROW_DS_EXPORT ParquetEncryptionConfig { + /// Shared pointer to CryptoFactory object, responsible for creating cryptographic + /// components like encryptors and decryptors. + std::shared_ptr crypto_factory; + + /// Shared pointer to KmsConnectionConfig object, holding configuration parameters for + /// connecting to a Key Management Service (KMS). + std::shared_ptr kms_connection_config; + + /// Shared pointer to EncryptionConfiguration object, defining specific encryption + /// settings for Parquet data, like keys for different columns. + std::shared_ptr encryption_config; +}; + +/// \brief Core configuration class encapsulating parameters for high-level decryption +/// within Parquet framework. +/// +/// ParquetDecryptionConfig is designed to pass decryption-related parameters to +/// appropriate decryption components within Parquet library. It holds references to +/// objects defining decryption strategy, Key Management Service (KMS) configuration, +/// and specific decryption configurations for reading encrypted Parquet data. +struct ARROW_DS_EXPORT ParquetDecryptionConfig { + /// Shared pointer to CryptoFactory object, pivotal in creating cryptographic + /// components for decryption process. + std::shared_ptr crypto_factory; + + /// Shared pointer to KmsConnectionConfig object, containing parameters for connecting + /// to a Key Management Service (KMS) during decryption. + std::shared_ptr kms_connection_config; + + /// Shared pointer to DecryptionConfiguration object, specifying decryption settings + /// for reading encrypted Parquet data. + std::shared_ptr decryption_config; +}; + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/partition.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/partition.h new file mode 100644 index 0000000000000000000000000000000000000000..315a3d384d28c1b313bf1483fb38ad99c6713663 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/partition.h @@ -0,0 +1,432 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
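A minimal sketch (not part of the vendored headers) of how the encryption/decryption configs above plug into the Parquet dataset options declared in file_parquet.h earlier in this diff; the CryptoFactory, KMS connection, and encryption/decryption configurations are assumed to be built elsewhere with the parquet::encryption APIs, and only the wiring is shown.

#include <memory>
#include <utility>

#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/parquet_encryption_config.h"

namespace ds = arrow::dataset;
namespace pqe = parquet::encryption;

void AttachParquetCrypto(std::shared_ptr<pqe::CryptoFactory> crypto_factory,
                         std::shared_ptr<pqe::KmsConnectionConfig> kms_config,
                         std::shared_ptr<pqe::EncryptionConfiguration> encryption,
                         std::shared_ptr<pqe::DecryptionConfiguration> decryption,
                         ds::ParquetFileWriteOptions* write_options,
                         ds::ParquetFragmentScanOptions* scan_options) {
  // Writing: a ParquetEncryptionConfig rides along on the file write options
  // (typically obtained from ParquetFileFormat::DefaultWriteOptions()).
  auto enc = std::make_shared<ds::ParquetEncryptionConfig>();
  enc->crypto_factory = crypto_factory;
  enc->kms_connection_config = kms_config;
  enc->encryption_config = std::move(encryption);
  write_options->parquet_encryption_config = std::move(enc);

  // Reading: a ParquetDecryptionConfig rides along on the per-scan options.
  auto dec = std::make_shared<ds::ParquetDecryptionConfig>();
  dec->crypto_factory = std::move(crypto_factory);
  dec->kms_connection_config = std::move(kms_config);
  dec->decryption_config = std::move(decryption);
  scan_options->parquet_decryption_config = std::move(dec);
}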
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/compute/expression.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/util/compare.h" + +namespace arrow { + +namespace dataset { + +constexpr char kFilenamePartitionSep = '_'; + +struct ARROW_DS_EXPORT PartitionPathFormat { + std::string directory, filename; +}; + +// ---------------------------------------------------------------------- +// Partitioning + +/// \defgroup dataset-partitioning Partitioning API +/// +/// @{ + +/// \brief Interface for parsing partition expressions from string partition +/// identifiers. +/// +/// For example, the identifier "foo=5" might be parsed to an equality expression +/// between the "foo" field and the value 5. +/// +/// Some partitionings may store the field names in a metadata +/// store instead of in file paths, for example +/// dataset_root/2009/11/... could be used when the partition fields +/// are "year" and "month" +/// +/// Paths are consumed from left to right. Paths must be relative to +/// the root of a partition; path prefixes must be removed before passing +/// the path to a partitioning for parsing. +class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable { + public: + virtual ~Partitioning() = default; + + /// \brief The name identifying the kind of partitioning + virtual std::string type_name() const = 0; + + //// \brief Return whether the partitionings are equal + virtual bool Equals(const Partitioning& other) const { + return schema_->Equals(other.schema_, /*check_metadata=*/false); + } + + /// \brief If the input batch shares any fields with this partitioning, + /// produce sub-batches which satisfy mutually exclusive Expressions. + struct PartitionedBatches { + RecordBatchVector batches; + std::vector expressions; + }; + virtual Result Partition( + const std::shared_ptr& batch) const = 0; + + /// \brief Parse a path into a partition expression + virtual Result Parse(const std::string& path) const = 0; + + virtual Result Format(const compute::Expression& expr) const = 0; + + /// \brief A default Partitioning which is a DirectoryPartitioning + /// with an empty schema. + static std::shared_ptr Default(); + + /// \brief The partition schema. + const std::shared_ptr& schema() const { return schema_; } + + protected: + explicit Partitioning(std::shared_ptr schema) : schema_(std::move(schema)) {} + + std::shared_ptr schema_; +}; + +/// \brief The encoding of partition segments. +enum class SegmentEncoding : int8_t { + /// No encoding. + None = 0, + /// Segment values are URL-encoded. + Uri = 1, +}; + +ARROW_DS_EXPORT +std::ostream& operator<<(std::ostream& os, SegmentEncoding segment_encoding); + +/// \brief Options for key-value based partitioning (hive/directory). +struct ARROW_DS_EXPORT KeyValuePartitioningOptions { + /// After splitting a path into components, decode the path components + /// before parsing according to this scheme. + SegmentEncoding segment_encoding = SegmentEncoding::Uri; +}; + +/// \brief Options for inferring a partitioning. +struct ARROW_DS_EXPORT PartitioningFactoryOptions { + /// When inferring a schema for partition fields, yield dictionary encoded types + /// instead of plain. This can be more efficient when materializing virtual + /// columns, and Expressions parsed by the finished Partitioning will include + /// dictionaries of all unique inspected values for each field. 
+ bool infer_dictionary = false; + /// Optionally, an expected schema can be provided, in which case inference + /// will only check discovered fields against the schema and update internal + /// state (such as dictionaries). + std::shared_ptr schema; + /// After splitting a path into components, decode the path components + /// before parsing according to this scheme. + SegmentEncoding segment_encoding = SegmentEncoding::Uri; + + KeyValuePartitioningOptions AsPartitioningOptions() const; +}; + +/// \brief Options for inferring a hive-style partitioning. +struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions { + /// The hive partitioning scheme maps null to a hard coded fallback string. + std::string null_fallback; + + HivePartitioningOptions AsHivePartitioningOptions() const; +}; + +/// \brief PartitioningFactory provides creation of a partitioning when the +/// specific schema must be inferred from available paths (no explicit schema is known). +class ARROW_DS_EXPORT PartitioningFactory { + public: + virtual ~PartitioningFactory() = default; + + /// \brief The name identifying the kind of partitioning + virtual std::string type_name() const = 0; + + /// Get the schema for the resulting Partitioning. + /// This may reset internal state, for example dictionaries of unique representations. + virtual Result> Inspect( + const std::vector& paths) = 0; + + /// Create a partitioning using the provided schema + /// (fields may be dropped). + virtual Result> Finish( + const std::shared_ptr& schema) const = 0; +}; + +/// \brief Subclass for the common case of a partitioning which yields an equality +/// expression for each segment +class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { + public: + /// An unconverted equality expression consisting of a field name and the representation + /// of a scalar value + struct Key { + std::string name; + std::optional value; + }; + + Result Partition( + const std::shared_ptr& batch) const override; + + Result Parse(const std::string& path) const override; + + Result Format(const compute::Expression& expr) const override; + + const ArrayVector& dictionaries() const { return dictionaries_; } + + SegmentEncoding segment_encoding() const { return options_.segment_encoding; } + + bool Equals(const Partitioning& other) const override; + + protected: + KeyValuePartitioning(std::shared_ptr schema, ArrayVector dictionaries, + KeyValuePartitioningOptions options) + : Partitioning(std::move(schema)), + dictionaries_(std::move(dictionaries)), + options_(options) { + if (dictionaries_.empty()) { + dictionaries_.resize(schema_->num_fields()); + } + } + + virtual Result> ParseKeys(const std::string& path) const = 0; + + virtual Result FormatValues(const ScalarVector& values) const = 0; + + /// Convert a Key to a full expression. + Result ConvertKey(const Key& key) const; + + Result> FormatPartitionSegments( + const ScalarVector& values) const; + Result> ParsePartitionSegments( + const std::vector& segments) const; + + ArrayVector dictionaries_; + KeyValuePartitioningOptions options_; +}; + +/// \brief DirectoryPartitioning parses one segment of a path for each field in its +/// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse +/// must contain segments for each field. 
+/// +/// For example given schema the path "/2009/11" would be +/// parsed to ("year"_ == 2009 and "month"_ == 11) +class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning { + public: + /// If a field in schema is of dictionary type, the corresponding element of + /// dictionaries must be contain the dictionary of values for that field. + explicit DirectoryPartitioning(std::shared_ptr schema, + ArrayVector dictionaries = {}, + KeyValuePartitioningOptions options = {}); + + std::string type_name() const override { return "directory"; } + + bool Equals(const Partitioning& other) const override; + + /// \brief Create a factory for a directory partitioning. + /// + /// \param[in] field_names The names for the partition fields. Types will be + /// inferred. + static std::shared_ptr MakeFactory( + std::vector field_names, PartitioningFactoryOptions = {}); + + private: + Result> ParseKeys(const std::string& path) const override; + + Result FormatValues(const ScalarVector& values) const override; +}; + +/// \brief The default fallback used for null values in a Hive-style partitioning. +static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__"; + +struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions { + std::string null_fallback = kDefaultHiveNullFallback; + + static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) { + HivePartitioningOptions options; + options.null_fallback = std::move(fallback); + return options; + } +}; + +/// \brief Multi-level, directory based partitioning +/// originating from Apache Hive with all data files stored in the +/// leaf directories. Data is partitioned by static values of a +/// particular column in the schema. Partition keys are represented in +/// the form $key=$value in directory names. +/// Field order is ignored, as are missing or unrecognized field names. +/// +/// For example given schema the path +/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321) +class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { + public: + /// If a field in schema is of dictionary type, the corresponding element of + /// dictionaries must be contain the dictionary of values for that field. + explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}, + std::string null_fallback = kDefaultHiveNullFallback) + : KeyValuePartitioning(std::move(schema), std::move(dictionaries), + KeyValuePartitioningOptions()), + hive_options_( + HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) { + } + + explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries, + HivePartitioningOptions options) + : KeyValuePartitioning(std::move(schema), std::move(dictionaries), options), + hive_options_(options) {} + + std::string type_name() const override { return "hive"; } + std::string null_fallback() const { return hive_options_.null_fallback; } + const HivePartitioningOptions& options() const { return hive_options_; } + + static Result> ParseKey(const std::string& segment, + const HivePartitioningOptions& options); + + bool Equals(const Partitioning& other) const override; + + /// \brief Create a factory for a hive partitioning. 
+ static std::shared_ptr MakeFactory( + HivePartitioningFactoryOptions = {}); + + private: + const HivePartitioningOptions hive_options_; + Result> ParseKeys(const std::string& path) const override; + + Result FormatValues(const ScalarVector& values) const override; +}; + +/// \brief Implementation provided by lambda or other callable +class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning { + public: + using ParseImpl = std::function(const std::string&)>; + + using FormatImpl = + std::function(const compute::Expression&)>; + + FunctionPartitioning(std::shared_ptr schema, ParseImpl parse_impl, + FormatImpl format_impl = NULLPTR, std::string name = "function") + : Partitioning(std::move(schema)), + parse_impl_(std::move(parse_impl)), + format_impl_(std::move(format_impl)), + name_(std::move(name)) {} + + std::string type_name() const override { return name_; } + + bool Equals(const Partitioning& other) const override { return false; } + + Result Parse(const std::string& path) const override { + return parse_impl_(path); + } + + Result Format(const compute::Expression& expr) const override { + if (format_impl_) { + return format_impl_(expr); + } + return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning"); + } + + Result Partition( + const std::shared_ptr& batch) const override { + return Status::NotImplemented("partitioning batches from ", type_name(), + " Partitioning"); + } + + private: + ParseImpl parse_impl_; + FormatImpl format_impl_; + std::string name_; +}; + +class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { + public: + /// \brief Construct a FilenamePartitioning from its components. + /// + /// If a field in schema is of dictionary type, the corresponding element of + /// dictionaries must be contain the dictionary of values for that field. + explicit FilenamePartitioning(std::shared_ptr schema, + ArrayVector dictionaries = {}, + KeyValuePartitioningOptions options = {}); + + std::string type_name() const override { return "filename"; } + + /// \brief Create a factory for a filename partitioning. + /// + /// \param[in] field_names The names for the partition fields. Types will be + /// inferred. + static std::shared_ptr MakeFactory( + std::vector field_names, PartitioningFactoryOptions = {}); + + bool Equals(const Partitioning& other) const override; + + private: + Result> ParseKeys(const std::string& path) const override; + + Result FormatValues(const ScalarVector& values) const override; +}; + +ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, + const std::string& prefix); + +/// \brief Extracts the directory and filename and removes the prefix of a path +/// +/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> +/// {"year=2019","c.txt"}` +ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path, + const std::string& prefix); + +/// \brief Vector version of StripPrefixAndFilename. +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( + const std::vector& paths, const std::string& prefix); + +/// \brief Vector version of StripPrefixAndFilename. 
+ARROW_DS_EXPORT std::vector StripPrefixAndFilename( + const std::vector& files, const std::string& prefix); + +/// \brief Either a Partitioning or a PartitioningFactory +class ARROW_DS_EXPORT PartitioningOrFactory { + public: + explicit PartitioningOrFactory(std::shared_ptr partitioning) + : partitioning_(std::move(partitioning)) {} + + explicit PartitioningOrFactory(std::shared_ptr factory) + : factory_(std::move(factory)) {} + + PartitioningOrFactory& operator=(std::shared_ptr partitioning) { + return *this = PartitioningOrFactory(std::move(partitioning)); + } + + PartitioningOrFactory& operator=(std::shared_ptr factory) { + return *this = PartitioningOrFactory(std::move(factory)); + } + + /// \brief The partitioning (if given). + const std::shared_ptr& partitioning() const { return partitioning_; } + + /// \brief The partition factory (if given). + const std::shared_ptr& factory() const { return factory_; } + + /// \brief Get the partition schema, inferring it with the given factory if needed. + Result> GetOrInferSchema(const std::vector& paths); + + private: + std::shared_ptr factory_; + std::shared_ptr partitioning_; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/pch.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/pch.h new file mode 100644 index 0000000000000000000000000000000000000000..a74fd96e3554e660c7bd01fcbd07974af8b68c98 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/pch.h @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Often-used headers, for precompiling. +// If updating this header, please make sure you check compilation speed +// before checking in. Adding headers which are not used extremely often +// may incur a slowdown, since it makes the precompiled header heavier to load. + +// This API is EXPERIMENTAL. + +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/scanner.h" +#include "arrow/pch.h" diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/plan.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/plan.h new file mode 100644 index 0000000000000000000000000000000000000000..10260ccec81d159ffd40d86144e39c4d91739db1 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/plan.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#include "arrow/dataset/visibility.h" + +namespace arrow { +namespace dataset { +namespace internal { + +/// Register dataset-based exec nodes with the exec node registry +/// +/// This function must be called before using dataset ExecNode factories +ARROW_DS_EXPORT void Initialize(); + +} // namespace internal +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/projector.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/projector.h new file mode 100644 index 0000000000000000000000000000000000000000..86d38f0af23522a08dcebc1c290fe6bc25ae014e --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/projector.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include "arrow/dataset/visibility.h" +#include "arrow/type_fwd.h" + +namespace arrow { +namespace dataset { + +// FIXME this is superceded by compute::Expression::Bind +ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to); + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/scanner.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/scanner.h new file mode 100644 index 0000000000000000000000000000000000000000..d2de267897180f138792d154c59d393f92832e21 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/scanner.h @@ -0,0 +1,583 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/acero/options.h" +#include "arrow/compute/expression.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/projector.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/io/interfaces.h" +#include "arrow/memory_pool.h" +#include "arrow/type_fwd.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/iterator.h" +#include "arrow/util/thread_pool.h" +#include "arrow/util/type_fwd.h" + +namespace arrow { + +using RecordBatchGenerator = std::function>()>; + +namespace dataset { + +/// \defgroup dataset-scanning Scanning API +/// +/// @{ + +constexpr int64_t kDefaultBatchSize = 1 << 17; // 128Ki rows +// This will yield 64 batches ~ 8Mi rows +constexpr int32_t kDefaultBatchReadahead = 16; +constexpr int32_t kDefaultFragmentReadahead = 4; +constexpr int32_t kDefaultBytesReadahead = 1 << 25; // 32MiB + +/// Scan-specific options, which can be changed between scans of the same dataset. +struct ARROW_DS_EXPORT ScanOptions { + /// A row filter (which will be pushed down to partitioning/reading if supported). + compute::Expression filter = compute::literal(true); + /// A projection expression (which can add/remove/rename columns). + compute::Expression projection; + + /// Schema with which batches will be read from fragments. This is also known as the + /// "reader schema" it will be used (for example) in constructing CSV file readers to + /// identify column types for parsing. Usually only a subset of its fields (see + /// MaterializedFields) will be materialized during a scan. + std::shared_ptr dataset_schema; + + /// Schema of projected record batches. This is independent of dataset_schema as its + /// fields are derived from the projection. For example, let + /// + /// dataset_schema = {"a": int32, "b": int32, "id": utf8} + /// projection = project({equal(field_ref("a"), field_ref("b"))}, {"a_plus_b"}) + /// + /// (no filter specified). In this case, the projected_schema would be + /// + /// {"a_plus_b": int32} + std::shared_ptr projected_schema; + + /// Maximum row count for scanned batches. + int64_t batch_size = kDefaultBatchSize; + + /// How many batches to read ahead within a fragment. + /// + /// Set to 0 to disable batch readahead + /// + /// Note: May not be supported by all formats + /// Note: Will be ignored if use_threads is set to false + int32_t batch_readahead = kDefaultBatchReadahead; + + /// How many files to read ahead + /// + /// Set to 0 to disable fragment readahead + /// + /// Note: May not be enforced by all scanners + /// Note: Will be ignored if use_threads is set to false + int32_t fragment_readahead = kDefaultFragmentReadahead; + + /// A pool from which materialized and scanned arrays will be allocated. 
+ MemoryPool* pool = arrow::default_memory_pool(); + + /// IOContext for any IO tasks + /// + /// Note: The IOContext executor will be ignored if use_threads is set to false + io::IOContext io_context; + + /// If true the scanner will scan in parallel + /// + /// Note: If true, this will use threads from both the cpu_executor and the + /// io_context.executor + /// Note: This must be true in order for any readahead to happen + bool use_threads = false; + + /// If true the scanner will add augmented fields to the output schema. + bool add_augmented_fields = true; + + /// Fragment-specific scan options. + std::shared_ptr fragment_scan_options; + + /// Return a vector of FieldRefs that require materialization. + /// + /// This is usually the union of the fields referenced in the projection and the + /// filter expression. Examples: + /// + /// - `SELECT a, b WHERE a < 2 && c > 1` => ["a", "b", "a", "c"] + /// - `SELECT a + b < 3 WHERE a > 1` => ["a", "b", "a"] + /// + /// This is needed for expression where a field may not be directly + /// used in the final projection but is still required to evaluate the + /// expression. + /// + /// This is used by Fragment implementations to apply the column + /// sub-selection optimization. + std::vector MaterializedFields() const; + + /// Parameters which control when the plan should pause for a slow consumer + acero::BackpressureOptions backpressure = + acero::BackpressureOptions::DefaultBackpressure(); +}; + +/// Scan-specific options, which can be changed between scans of the same dataset. +/// +/// A dataset consists of one or more individual fragments. A fragment is anything +/// that is independently scannable, often a file. +/// +/// Batches from all fragments will be converted to a single schema. This unified +/// schema is referred to as the "dataset schema" and is the output schema for +/// this node. +/// +/// Individual fragments may have schemas that are different from the dataset +/// schema. This is sometimes referred to as the physical or fragment schema. +/// Conversion from the fragment schema to the dataset schema is a process +/// known as evolution. +struct ARROW_DS_EXPORT ScanV2Options : public acero::ExecNodeOptions { + explicit ScanV2Options(std::shared_ptr dataset) + : dataset(std::move(dataset)) {} + + /// \brief The dataset to scan + std::shared_ptr dataset; + /// \brief A row filter + /// + /// The filter expression should be written against the dataset schema. + /// The filter must be unbound. + /// + /// This is an opportunistic pushdown filter. Filtering capabilities will + /// vary between formats. If a format is not capable of applying the filter + /// then it will ignore it. + /// + /// Each fragment will do its best to filter the data based on the information + /// (partitioning guarantees, statistics) available to it. If it is able to + /// apply some filtering then it will indicate what filtering it was able to + /// apply by attaching a guarantee to the batch. + /// + /// For example, if a filter is x < 50 && y > 40 then a batch may be able to + /// apply a guarantee x < 50. Post-scan filtering would then only need to + /// consider y > 40 (for this specific batch). The next batch may not be able + /// to attach any guarantee and both clauses would need to be applied to that batch. + /// + /// A single guarantee-aware filtering operation should generally be applied to all + /// resulting batches. The scan node is not responsible for this. 
+ /// + /// Fields that are referenced by the filter should be included in the `columns` vector. + /// The scan node will not automatically fetch fields referenced by the filter + /// expression. \see AddFieldsNeededForFilter + /// + /// If the filter references fields that are not included in `columns` this may or may + /// not be an error, depending on the format. + compute::Expression filter = compute::literal(true); + + /// \brief The columns to scan + /// + /// This is not a simple list of top-level column indices but instead a set of paths + /// allowing for partial selection of columns + /// + /// These paths refer to the dataset schema + /// + /// For example, consider the following dataset schema: + /// schema({ + /// field("score", int32()), + /// "marker", struct_({ + /// field("color", utf8()), + /// field("location", struct_({ + /// field("x", float64()), + /// field("y", float64()) + /// }) + /// }) + /// }) + /// + /// If `columns` is {{0}, {1,1,0}} then the output schema is: + /// schema({field("score", int32()), field("x", float64())}) + /// + /// If `columns` is {{1,1,1}, {1,1}} then the output schema is: + /// schema({ + /// field("y", float64()), + /// field("location", struct_({ + /// field("x", float64()), + /// field("y", float64()) + /// }) + /// }) + std::vector columns; + + /// \brief Target number of bytes to read ahead in a fragment + /// + /// This limit involves some amount of estimation. Formats typically only know + /// batch boundaries in terms of rows (not decoded bytes) and so an estimation + /// must be done to guess the average row size. Other formats like CSV and JSON + /// must make even more generalized guesses. + /// + /// This is a best-effort guide. Some formats may need to read ahead further, + /// for example, if scanning a parquet file that has batches with 100MiB of data + /// then the actual readahead will be at least 100MiB + /// + /// Set to 0 to disable readahead. When disabled, the scanner will read the + /// dataset one batch at a time + /// + /// This limit applies across all fragments. If the limit is 32MiB and the + /// fragment readahead allows for 20 fragments to be read at once then the + /// total readahead will still be 32MiB and NOT 20 * 32MiB. + int32_t target_bytes_readahead = kDefaultBytesReadahead; + + /// \brief Number of fragments to read ahead + /// + /// Higher readahead will potentially lead to more efficient I/O but will lead + /// to the scan operation using more RAM. The default is fairly conservative + /// and designed for fast local disks (or slow local spinning disks which cannot + /// handle much parallelism anyways). When using a highly parallel remote filesystem + /// you will likely want to increase these values. + /// + /// Set to 0 to disable fragment readahead. When disabled the dataset will be scanned + /// one fragment at a time. + int32_t fragment_readahead = kDefaultFragmentReadahead; + /// \brief Options specific to the file format + const FragmentScanOptions* format_options = NULLPTR; + + /// \brief Utility method to get a selection representing all columns in a dataset + static std::vector AllColumns(const Schema& dataset_schema); + + /// \brief Utility method to add fields needed for the current filter + /// + /// This method adds any fields that are needed by `filter` which are not already + /// included in the list of columns. Any new fields added will be added to the end + /// in no particular order. 
+ static Status AddFieldsNeededForFilter(ScanV2Options* options); +}; + +/// \brief Describes a projection +struct ARROW_DS_EXPORT ProjectionDescr { + /// \brief The projection expression itself + /// This expression must be a call to make_struct + compute::Expression expression; + /// \brief The output schema of the projection. + + /// This can be calculated from the input schema and the expression but it + /// is cached here for convenience. + std::shared_ptr schema; + + /// \brief Create a ProjectionDescr by binding an expression to the dataset schema + /// + /// expression must return a struct type + static Result FromStructExpression( + const compute::Expression& expression, const Schema& dataset_schema); + + /// \brief Create a ProjectionDescr from expressions/names for each field + static Result FromExpressions(std::vector exprs, + std::vector names, + const Schema& dataset_schema); + + /// \brief Create a default projection referencing fields in the dataset schema + static Result FromNames(std::vector names, + const Schema& dataset_schema, + bool add_augmented_fields = true); + + /// \brief Make a projection that projects every field in the dataset schema + static Result Default(const Schema& dataset_schema, + bool add_augmented_fields = true); +}; + +/// \brief Utility method to set the projection expression and schema +ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr projection); + +/// \brief Combines a record batch with the fragment that the record batch originated +/// from +/// +/// Knowing the source fragment can be useful for debugging & understanding loaded +/// data +struct TaggedRecordBatch { + std::shared_ptr record_batch; + std::shared_ptr fragment; +}; +using TaggedRecordBatchGenerator = std::function()>; +using TaggedRecordBatchIterator = Iterator; + +/// \brief Combines a tagged batch with positional information +/// +/// This is returned when scanning batches in an unordered fashion. This information is +/// needed if you ever want to reassemble the batches in order +struct EnumeratedRecordBatch { + Enumerated> record_batch; + Enumerated> fragment; +}; +using EnumeratedRecordBatchGenerator = std::function()>; +using EnumeratedRecordBatchIterator = Iterator; + +/// @} + +} // namespace dataset + +template <> +struct IterationTraits { + static dataset::TaggedRecordBatch End() { + return dataset::TaggedRecordBatch{NULLPTR, NULLPTR}; + } + static bool IsEnd(const dataset::TaggedRecordBatch& val) { + return val.record_batch == NULLPTR; + } +}; + +template <> +struct IterationTraits { + static dataset::EnumeratedRecordBatch End() { + return dataset::EnumeratedRecordBatch{ + IterationEnd>>(), + IterationEnd>>()}; + } + static bool IsEnd(const dataset::EnumeratedRecordBatch& val) { + return IsIterationEnd(val.fragment); + } +}; + +namespace dataset { + +/// \defgroup dataset-scanning Scanning API +/// +/// @{ + +/// \brief A scanner glues together several dataset classes to load in data. +/// The dataset contains a collection of fragments and partitioning rules. +/// +/// The fragments identify independently loadable units of data (i.e. each fragment has +/// a potentially unique schema and possibly even format. It should be possible to read +/// fragments in parallel if desired). +/// +/// The fragment's format contains the logic necessary to actually create a task to load +/// the fragment into memory. That task may or may not support parallel execution of +/// its own. 
+///
+/// The scanner is then responsible for creating scan tasks from every fragment in the
+/// dataset and (potentially) sequencing the loaded record batches together.
+///
+/// The scanner should not buffer the entire dataset in memory (unless asked) instead
+/// yielding record batches as soon as they are ready to scan. Various readahead
+/// properties control how much data is allowed to be scanned before pausing to let a
+/// slow consumer catchup.
+///
+/// Today the scanner also handles projection & filtering although that may change in
+/// the future.
+class ARROW_DS_EXPORT Scanner {
+ public:
+  virtual ~Scanner() = default;
+
+  /// \brief Apply a visitor to each RecordBatch as it is scanned. If multiple threads
+  /// are used (via use_threads), the visitor will be invoked from those threads and is
+  /// responsible for any synchronization.
+  virtual Status Scan(std::function<Status(TaggedRecordBatch)> visitor) = 0;
+  /// \brief Convert a Scanner into a Table.
+  ///
+  /// Use this convenience utility with care. This will serially materialize the
+  /// Scan result in memory before creating the Table.
+  virtual Result<std::shared_ptr<Table>> ToTable() = 0;
+  /// \brief Scan the dataset into a stream of record batches. Each batch is tagged
+  /// with the fragment it originated from. The batches will arrive in order. The
+  /// order of fragments is determined by the dataset.
+  ///
+  /// Note: The scanner will perform some readahead but will avoid materializing too
+  /// much in memory (this is governed by the readahead options and use_threads option).
+  /// If the readahead queue fills up then I/O will pause until the calling thread catches
+  /// up.
+  virtual Result<TaggedRecordBatchIterator> ScanBatches() = 0;
+  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync() = 0;
+  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync(
+      ::arrow::internal::Executor* cpu_thread_pool) = 0;
+  /// \brief Scan the dataset into a stream of record batches. Unlike ScanBatches this
+  /// method may allow record batches to be returned out of order. This allows for more
+  /// efficient scanning: some fragments may be accessed more quickly than others (e.g.
+  /// may be cached in RAM or just happen to get scheduled earlier by the I/O)
+  ///
+  /// To make up for the out-of-order iteration each batch is further tagged with
+  /// positional information.
+  virtual Result<EnumeratedRecordBatchIterator> ScanBatchesUnordered() = 0;
+  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync() = 0;
+  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync(
+      ::arrow::internal::Executor* cpu_thread_pool) = 0;
+  /// \brief A convenience to synchronously load the given rows by index.
+  ///
+  /// Will only consume as many batches as needed from ScanBatches().
+  virtual Result<std::shared_ptr<Table>> TakeRows(const Array& indices) = 0;
+  /// \brief Get the first N rows.
+  virtual Result<std::shared_ptr<Table>> Head(int64_t num_rows) = 0;
+  /// \brief Count rows matching a predicate.
+  ///
+  /// This method will push down the predicate and compute the result based on fragment
+  /// metadata if possible.
+  virtual Result<int64_t> CountRows() = 0;
+  virtual Future<int64_t> CountRowsAsync() = 0;
+  /// \brief Convert the Scanner to a RecordBatchReader so it can be
+  /// easily used with APIs that expect a reader.
+  virtual Result<std::shared_ptr<RecordBatchReader>> ToRecordBatchReader() = 0;
+
+  /// \brief Get the options for this scan.
+ const std::shared_ptr& options() const { return scan_options_; } + /// \brief Get the dataset that this scanner will scan + virtual const std::shared_ptr& dataset() const = 0; + + protected: + explicit Scanner(std::shared_ptr scan_options) + : scan_options_(std::move(scan_options)) {} + + Result AddPositioningToInOrderScan( + TaggedRecordBatchIterator scan); + + const std::shared_ptr scan_options_; +}; + +/// \brief ScannerBuilder is a factory class to construct a Scanner. It is used +/// to pass information, notably a potential filter expression and a subset of +/// columns to materialize. +class ARROW_DS_EXPORT ScannerBuilder { + public: + explicit ScannerBuilder(std::shared_ptr dataset); + + ScannerBuilder(std::shared_ptr dataset, + std::shared_ptr scan_options); + + ScannerBuilder(std::shared_ptr schema, std::shared_ptr fragment, + std::shared_ptr scan_options); + + /// \brief Make a scanner from a record batch reader. + /// + /// The resulting scanner can be scanned only once. This is intended + /// to support writing data from streaming sources or other sources + /// that can be iterated only once. + static std::shared_ptr FromRecordBatchReader( + std::shared_ptr reader); + + /// \brief Set the subset of columns to materialize. + /// + /// Columns which are not referenced may not be read from fragments. + /// + /// \param[in] columns list of columns to project. Order and duplicates will + /// be preserved. + /// + /// \return Failure if any column name does not exists in the dataset's + /// Schema. + Status Project(std::vector columns); + + /// \brief Set expressions which will be evaluated to produce the materialized + /// columns. + /// + /// Columns which are not referenced may not be read from fragments. + /// + /// \param[in] exprs expressions to evaluate to produce columns. + /// \param[in] names list of names for the resulting columns. + /// + /// \return Failure if any referenced column does not exists in the dataset's + /// Schema. + Status Project(std::vector exprs, std::vector names); + + /// \brief Set the filter expression to return only rows matching the filter. + /// + /// The predicate will be passed down to Sources and corresponding + /// Fragments to exploit predicate pushdown if possible using + /// partition information or Fragment internal metadata, e.g. Parquet statistics. + /// Columns which are not referenced may not be read from fragments. + /// + /// \param[in] filter expression to filter rows with. + /// + /// \return Failure if any referenced columns does not exist in the dataset's + /// Schema. + Status Filter(const compute::Expression& filter); + + /// \brief Indicate if the Scanner should make use of the available + /// ThreadPool found in ScanOptions; + Status UseThreads(bool use_threads = true); + + /// \brief Set the maximum number of rows per RecordBatch. + /// + /// \param[in] batch_size the maximum number of rows. + /// \returns An error if the number for batch is not greater than 0. + /// + /// This option provides a control limiting the memory owned by any RecordBatch. + Status BatchSize(int64_t batch_size); + + /// \brief Set the number of batches to read ahead within a fragment. + /// + /// \param[in] batch_readahead How many batches to read ahead within a fragment + /// \returns an error if this number is less than 0. + /// + /// This option provides a control on the RAM vs I/O tradeoff. + /// It might not be supported by all file formats, in which case it will + /// simply be ignored. 
+ Status BatchReadahead(int32_t batch_readahead); + + /// \brief Set the number of fragments to read ahead + /// + /// \param[in] fragment_readahead How many fragments to read ahead + /// \returns an error if this number is less than 0. + /// + /// This option provides a control on the RAM vs I/O tradeoff. + Status FragmentReadahead(int32_t fragment_readahead); + + /// \brief Set the pool from which materialized and scanned arrays will be allocated. + Status Pool(MemoryPool* pool); + + /// \brief Set fragment-specific scan options. + Status FragmentScanOptions(std::shared_ptr fragment_scan_options); + + /// \brief Override default backpressure configuration + Status Backpressure(acero::BackpressureOptions backpressure); + + /// \brief Return the current scan options for the builder. + Result> GetScanOptions(); + + /// \brief Return the constructed now-immutable Scanner object + Result> Finish(); + + const std::shared_ptr& schema() const; + const std::shared_ptr& projected_schema() const; + + private: + std::shared_ptr dataset_; + std::shared_ptr scan_options_ = std::make_shared(); +}; + +/// \brief Construct a source ExecNode which yields batches from a dataset scan. +/// +/// Does not construct associated filter or project nodes. +/// Yielded batches will be augmented with fragment/batch indices to enable stable +/// ordering for simple ExecPlans. +class ARROW_DS_EXPORT ScanNodeOptions : public acero::ExecNodeOptions { + public: + explicit ScanNodeOptions(std::shared_ptr dataset, + std::shared_ptr scan_options, + bool require_sequenced_output = false) + : dataset(std::move(dataset)), + scan_options(std::move(scan_options)), + require_sequenced_output(require_sequenced_output) {} + + std::shared_ptr dataset; + std::shared_ptr scan_options; + bool require_sequenced_output; +}; + +/// @} + +namespace internal { +ARROW_DS_EXPORT void InitializeScanner(arrow::acero::ExecFactoryRegistry* registry); +ARROW_DS_EXPORT void InitializeScannerV2(arrow::acero::ExecFactoryRegistry* registry); +} // namespace internal +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/type_fwd.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..d58781e038de9ffc2686ebfda9f640eeacdd6668 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/type_fwd.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
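// ---------------------------------------------------------------------------
// [Editor's note - illustrative only, not part of the vendored headers]
// A minimal sketch of driving the ScannerBuilder/Scanner API declared in
// scanner.h above. The function name ScanToTable and the column names "a"/"b"
// are placeholders; it assumes an already-constructed
// std::shared_ptr<arrow::dataset::Dataset> and uses the expression helpers
// (field_ref, literal, greater) from arrow/compute/expression.h.
#include <memory>

#include "arrow/compute/expression.h"
#include "arrow/dataset/scanner.h"
#include "arrow/result.h"

arrow::Result<std::shared_ptr<arrow::Table>> ScanToTable(
    std::shared_ptr<arrow::dataset::Dataset> dataset) {
  namespace cp = arrow::compute;
  arrow::dataset::ScannerBuilder builder(std::move(dataset));
  // Materialize only the referenced columns; fragments may skip the rest.
  ARROW_RETURN_NOT_OK(builder.Project({"a", "b"}));
  // The predicate is pushed down to fragments where the format supports it.
  ARROW_RETURN_NOT_OK(builder.Filter(cp::greater(cp::field_ref("a"), cp::literal(1))));
  ARROW_RETURN_NOT_OK(builder.UseThreads(true));
  ARROW_RETURN_NOT_OK(builder.BatchSize(64 * 1024));
  ARROW_ASSIGN_OR_RAISE(auto scanner, builder.Finish());
  // ToTable() materializes the full scan; prefer ScanBatches() for streaming.
  return scanner->ToTable();
}
// ---------------------------------------------------------------------------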
+ +#pragma once + +#include +#include + +#include "arrow/compute/type_fwd.h" // IWYU pragma: export +#include "arrow/dataset/visibility.h" +#include "arrow/filesystem/type_fwd.h" // IWYU pragma: export +#include "arrow/type_fwd.h" // IWYU pragma: export + +namespace arrow { +namespace dataset { + +class Dataset; +class DatasetFactory; +using DatasetVector = std::vector>; + +class UnionDataset; +class UnionDatasetFactory; + +class Fragment; +using FragmentIterator = Iterator>; +using FragmentVector = std::vector>; + +class FragmentScanOptions; + +class FileSource; +class FileFormat; +class FileFragment; +class FileWriter; +class FileWriteOptions; +class FileSystemDataset; +class FileSystemDatasetFactory; +struct FileSystemDatasetWriteOptions; +class WriteNodeOptions; + +/// \brief Controls what happens if files exist in an output directory during a dataset +/// write +enum class ExistingDataBehavior : int8_t { + /// Deletes all files in a directory the first time that directory is encountered + kDeleteMatchingPartitions, + /// Ignores existing files, overwriting any that happen to have the same name as an + /// output file + kOverwriteOrIgnore, + /// Returns an error if there are any files or subdirectories in the output directory + kError, +}; + +class InMemoryDataset; + +class CsvFileFormat; +class CsvFileWriter; +class CsvFileWriteOptions; +struct CsvFragmentScanOptions; + +class JsonFileFormat; +class JsonFileWriter; +class JsonFileWriteOptions; +struct JsonFragmentScanOptions; + +class IpcFileFormat; +class IpcFileWriter; +class IpcFileWriteOptions; +class IpcFragmentScanOptions; + +class ParquetFileFormat; +class ParquetFileFragment; +class ParquetFragmentScanOptions; +class ParquetFileWriter; +class ParquetFileWriteOptions; + +class Partitioning; +class PartitioningFactory; +class PartitioningOrFactory; +struct KeyValuePartitioningOptions; +class DirectoryPartitioning; +class HivePartitioning; +struct HivePartitioningOptions; +class FilenamePartitioning; +struct FilenamePartitioningOptions; + +class ScanNodeOptions; +struct ScanOptions; + +class Scanner; + +class ScannerBuilder; + +class ScanTask; +using ScanTaskVector = std::vector>; +using ScanTaskIterator = Iterator>; + +} // namespace dataset +} // namespace arrow diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/visibility.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/visibility.h new file mode 100644 index 0000000000000000000000000000000000000000..752907238ca071238e21a303a947afbc1f11217f --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/visibility.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
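// ---------------------------------------------------------------------------
// [Editor's note - illustrative only, not part of the vendored headers]
// Sketch of how the ExistingDataBehavior enum declared in type_fwd.h above is
// typically consumed when writing a dataset. FileSystemDatasetWriteOptions and
// FileSystemDataset::Write are only forward-declared in that header; their
// members and signatures live in arrow/dataset/file_base.h, so the exact field
// names below are assumptions. WriteDataset, its parameters, "/tmp/out" and the
// basename template are placeholders.
#include <memory>

#include "arrow/dataset/file_base.h"  // assumed location of the write options
#include "arrow/dataset/scanner.h"

arrow::Status WriteDataset(std::shared_ptr<arrow::dataset::Scanner> scanner,
                           std::shared_ptr<arrow::fs::FileSystem> filesystem,
                           std::shared_ptr<arrow::dataset::FileFormat> format,
                           std::shared_ptr<arrow::dataset::Partitioning> partitioning) {
  arrow::dataset::FileSystemDatasetWriteOptions write_options;
  write_options.file_write_options = format->DefaultWriteOptions();
  write_options.filesystem = std::move(filesystem);
  write_options.base_dir = "/tmp/out";
  write_options.partitioning = std::move(partitioning);
  write_options.basename_template = "part-{i}.arrow";
  // Refuse to clobber anything already present in base_dir.
  write_options.existing_data_behavior = arrow::dataset::ExistingDataBehavior::kError;
  return arrow::dataset::FileSystemDataset::Write(write_options, std::move(scanner));
}
// ---------------------------------------------------------------------------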
+ +#pragma once + +#if defined(_WIN32) || defined(__CYGWIN__) +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4251) +# else +# pragma GCC diagnostic ignored "-Wattributes" +# endif + +# ifdef ARROW_DS_STATIC +# define ARROW_DS_EXPORT +# elif defined(ARROW_DS_EXPORTING) +# define ARROW_DS_EXPORT __declspec(dllexport) +# else +# define ARROW_DS_EXPORT __declspec(dllimport) +# endif + +# define ARROW_DS_NO_EXPORT +#else // Not Windows +# ifndef ARROW_DS_EXPORT +# define ARROW_DS_EXPORT __attribute__((visibility("default"))) +# endif +# ifndef ARROW_DS_NO_EXPORT +# define ARROW_DS_NO_EXPORT __attribute__((visibility("hidden"))) +# endif +#endif // Non-Windows + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/datum.h b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/datum.h new file mode 100644 index 0000000000000000000000000000000000000000..4a88e7a81125cbed89d78d0e67288075ed9295f8 --- /dev/null +++ b/graphrag-ollama/lib/python3.12/site-packages/pyarrow/include/arrow/datum.h @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/device_allocation_type_set.h" +#include "arrow/scalar.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class ChunkedArray; +class RecordBatch; +class Table; + +/// \class Datum +/// \brief Variant type for various Arrow C++ data structures +struct ARROW_EXPORT Datum { + /// \brief The kind of datum stored + enum Kind { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE }; + + /// \brief A placeholder type to represent empty datum + struct Empty {}; + + /// \brief Datums variants may have a length. This special value indicate that the + /// current variant does not have a length. + static constexpr int64_t kUnknownLength = -1; + + /// \brief Storage of the actual datum. + /// + /// Note: For arrays, ArrayData is stored instead of Array for easier processing + std::variant, std::shared_ptr, + std::shared_ptr, std::shared_ptr, + std::shared_ptr
<Table>>
+      value;
+
+  /// \brief Empty datum, to be populated elsewhere
+  Datum() = default;
+
+  Datum(const Datum& other) = default;
+  Datum& operator=(const Datum& other) = default;
+  Datum(Datum&& other) = default;
+  Datum& operator=(Datum&& other) = default;
+
+  /// \brief Construct from a Scalar
+  Datum(std::shared_ptr<Scalar> value)  // NOLINT implicit conversion
+      : value(std::move(value)) {}
+
+  /// \brief Construct from an ArrayData
+  Datum(std::shared_ptr<ArrayData> value)  // NOLINT implicit conversion
+      : value(std::move(value)) {}
+
+  /// \brief Construct from an ArrayData
+  Datum(ArrayData arg)  // NOLINT implicit conversion
+      : value(std::make_shared<ArrayData>(std::move(arg))) {}
+
+  /// \brief Construct from an Array
+  Datum(const Array& value);  // NOLINT implicit conversion
+
+  /// \brief Construct from an Array
+  Datum(const std::shared_ptr<Array>& value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a ChunkedArray
+  Datum(std::shared_ptr<ChunkedArray> value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a RecordBatch
+  Datum(std::shared_ptr<RecordBatch> value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a Table
+  Datum(std::shared_ptr<Table> value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a ChunkedArray.
+  ///
+  /// This can be expensive, prefer the shared_ptr<ChunkedArray> constructor
+  explicit Datum(const ChunkedArray& value);
+
+  /// \brief Construct from a RecordBatch.
+  ///
+  /// This can be expensive, prefer the shared_ptr<RecordBatch> constructor
+  explicit Datum(const RecordBatch& value);
+
+  /// \brief Construct from a Table.
+  ///
+  /// This can be expensive, prefer the shared_ptr<Table> constructor
+  explicit Datum(const Table& value);
+
+  /// \brief Cast from concrete subtypes of Array or Scalar to Datum
+  template <typename T, bool IsArray = std::is_base_of_v<Array, T>,
+            bool IsScalar = std::is_base_of_v<Scalar, T>,
+            typename = enable_if_t<IsArray || IsScalar>>
+  Datum(std::shared_ptr<T> value)  // NOLINT implicit conversion
+      : Datum(std::shared_ptr<typename std::conditional<IsArray, Array, Scalar>::type>(
+            std::move(value))) {}
+
+  /// \brief Cast from concrete subtypes of Array or Scalar to Datum
+  template <typename T, typename TV = std::remove_reference_t<T>,
+            bool IsArray = std::is_base_of_v<Array, TV>,
+            bool IsScalar = std::is_base_of_v<Scalar, TV>,
+            typename = enable_if_t<IsArray || IsScalar>>
+  Datum(T&& value)  // NOLINT implicit conversion
+      : Datum(std::make_shared<TV>(std::forward<T>(value))) {}
+
+  /// \brief Copy from concrete subtypes of Scalar.
+  ///
+  /// The concrete scalar type must be copyable (not all of them are).
+  template <typename T, typename = enable_if_t<std::is_base_of_v<Scalar, T>>>
+  Datum(const T& value)  // NOLINT implicit conversion
+      : Datum(std::make_shared<T>(value)) {}
+
+  // Convenience constructors
+  /// \brief Convenience constructor storing a bool scalar.
+  explicit Datum(bool value);
+  /// \brief Convenience constructor storing an int8 scalar.
+  explicit Datum(int8_t value);
+  /// \brief Convenience constructor storing a uint8 scalar.
+  explicit Datum(uint8_t value);
+  /// \brief Convenience constructor storing an int16 scalar.
+  explicit Datum(int16_t value);
+  /// \brief Convenience constructor storing a uint16 scalar.
+  explicit Datum(uint16_t value);
+  /// \brief Convenience constructor storing an int32 scalar.
+  explicit Datum(int32_t value);
+  /// \brief Convenience constructor storing a uint32 scalar.
+  explicit Datum(uint32_t value);
+  /// \brief Convenience constructor storing an int64 scalar.
+  explicit Datum(int64_t value);
+  /// \brief Convenience constructor storing a uint64 scalar.
+  explicit Datum(uint64_t value);
+  /// \brief Convenience constructor storing a float scalar.
+  explicit Datum(float value);
+  /// \brief Convenience constructor storing a double scalar.
+  explicit Datum(double value);
+  /// \brief Convenience constructor storing a string scalar.
+  explicit Datum(std::string value);
+  /// \brief Convenience constructor storing a string scalar.
+  explicit Datum(const char* value);
+
+  /// \brief Convenience constructor for a DurationScalar from std::chrono::duration
+  template