From b9365a3a56080c1da780de6ad0b84735dda993be Mon Sep 17 00:00:00 2001
From: Anirban Ray <39331844+yarnabrina@users.noreply.github.com>
Date: Sun, 31 Dec 2023 16:15:00 +0530
Subject: [PATCH] Release v0.0.1
---
.flake8 | 5 +
.gitattributes | 131 +
.gitignore | 279 +
.pre-commit-config.yaml | 174 +
LICENSE | 21 +
README.md | 1 +
pyproject.toml | 387 +
requirements/constraints.fine_tuning.txt | 8 +
requirements/constraints.txt | 11 +
requirements/requirements.fine_tuning.txt | 8 +
requirements/requirements.txt | 11 +
src/cli.py | 88 +
src/generative_ai/__init__.py | 23 +
.../dataset_generation/__init__.py | 32 +
.../orchestrate_generation.py | 106 +
.../dataset_generation/step_1_generation.py | 291 +
.../dataset_generation/step_2_generation.py | 2787 ++++++
.../dataset_generation/utils_generation.py | 211 +
.../fine_tuning/step_1_tuning.ipynb | 7589 +++++++++++++++++
.../fine_tuning/step_2_tuning.ipynb | 756 ++
.../information_retrieval/__init__.py | 53 +
.../orchestrate_retrieval.py | 127 +
.../information_retrieval/step_1_retrieval.py | 45 +
.../information_retrieval/step_2_retrieval.py | 105 +
.../information_retrieval/step_3_retrieval.py | 53 +
.../information_retrieval/utils_retrieval.py | 57 +
src/generative_ai/metadata.json | 21 +
src/generative_ai/py.typed | 0
src/generative_ai/top_level.py | 130 +
src/generative_ai/utils_top_level.py | 12 +
src/gui.py | 303 +
31 files changed, 13825 insertions(+)
create mode 100644 .flake8
create mode 100644 .gitattributes
create mode 100644 .gitignore
create mode 100644 .pre-commit-config.yaml
create mode 100644 LICENSE
create mode 100644 README.md
create mode 100644 pyproject.toml
create mode 100644 requirements/constraints.fine_tuning.txt
create mode 100644 requirements/constraints.txt
create mode 100644 requirements/requirements.fine_tuning.txt
create mode 100644 requirements/requirements.txt
create mode 100644 src/cli.py
create mode 100644 src/generative_ai/__init__.py
create mode 100644 src/generative_ai/dataset_generation/__init__.py
create mode 100644 src/generative_ai/dataset_generation/orchestrate_generation.py
create mode 100644 src/generative_ai/dataset_generation/step_1_generation.py
create mode 100644 src/generative_ai/dataset_generation/step_2_generation.py
create mode 100644 src/generative_ai/dataset_generation/utils_generation.py
create mode 100644 src/generative_ai/fine_tuning/step_1_tuning.ipynb
create mode 100644 src/generative_ai/fine_tuning/step_2_tuning.ipynb
create mode 100644 src/generative_ai/information_retrieval/__init__.py
create mode 100644 src/generative_ai/information_retrieval/orchestrate_retrieval.py
create mode 100644 src/generative_ai/information_retrieval/step_1_retrieval.py
create mode 100644 src/generative_ai/information_retrieval/step_2_retrieval.py
create mode 100644 src/generative_ai/information_retrieval/step_3_retrieval.py
create mode 100644 src/generative_ai/information_retrieval/utils_retrieval.py
create mode 100644 src/generative_ai/metadata.json
create mode 100644 src/generative_ai/py.typed
create mode 100644 src/generative_ai/top_level.py
create mode 100644 src/generative_ai/utils_top_level.py
create mode 100644 src/gui.py
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..e7ad425
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+extend-ignore = E203
+per-file-ignores =
+ __init__.py: F401
+max-line-length = 99
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..ab9328b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,131 @@
+## Reference: https://github.com/alexkaratarakis/gitattributes/blob/7beed92a802062af247243d6c06a65fbbc7a35de/Common.gitattributes
+
+# Common settings that generally should always be used with your language specific settings
+
+# Auto detect text files and perform LF normalization
+* text=auto
+
+#
+# The above will handle all files NOT found below
+#
+
+# Documents
+*.bibtex text diff=bibtex
+*.doc diff=astextplain
+*.DOC diff=astextplain
+*.docx diff=astextplain
+*.DOCX diff=astextplain
+*.dot diff=astextplain
+*.DOT diff=astextplain
+*.pdf diff=astextplain
+*.PDF diff=astextplain
+*.rtf diff=astextplain
+*.RTF diff=astextplain
+*.md text diff=markdown
+*.mdx text diff=markdown
+*.tex text diff=tex
+*.adoc text
+*.textile text
+*.mustache text
+*.csv text eol=crlf
+*.tab text
+*.tsv text
+*.txt text
+*.sql text
+*.epub diff=astextplain
+
+# Graphics
+*.png binary
+*.jpg binary
+*.jpeg binary
+*.gif binary
+*.tif binary
+*.tiff binary
+*.ico binary
+# SVG treated as text by default.
+*.svg text
+# If you want to treat it as binary,
+# use the following line instead.
+# *.svg binary
+*.eps binary
+
+# Scripts
+*.bash text eol=lf
+*.fish text eol=lf
+*.sh text eol=lf
+*.zsh text eol=lf
+# These are explicitly windows files and should use crlf
+*.bat text eol=crlf
+*.cmd text eol=crlf
+*.ps1 text eol=crlf
+
+# Serialisation
+*.json text
+*.toml text
+*.xml text
+*.yaml text
+*.yml text
+
+# Archives
+*.7z binary
+*.gz binary
+*.tar binary
+*.tgz binary
+*.zip binary
+
+# Text files where line endings should be preserved
+*.patch -text
+
+#
+# Exclude files from exporting
+#
+
+.gitattributes export-ignore
+.gitignore export-ignore
+.gitkeep export-ignore
+
+## Reference: https://github.com/alexkaratarakis/gitattributes/blob/7beed92a802062af247243d6c06a65fbbc7a35de/Python.gitattributes
+
+# Basic .gitattributes for a python repo.
+
+# Source files
+# ============
+*.pxd text diff=python
+*.py text diff=python
+*.py3 text diff=python
+*.pyw text diff=python
+*.pyx text diff=python
+*.pyz text diff=python
+*.pyi text diff=python
+
+# Binary files
+# ============
+*.db binary
+*.p binary
+*.pkl binary
+*.pickle binary
+*.pyc binary export-ignore
+*.pyo binary export-ignore
+*.pyd binary
+
+# Jupyter notebook
+*.ipynb text eol=lf
+
+# Note: .db, .p, and .pkl files are associated
+# with the python modules ``pickle``, ``dbm.*``,
+# ``shelve``, ``marshal``, ``anydbm``, & ``bsddb``
+# (among others).
+
+## Reference: https://github.com/alexkaratarakis/gitattributes/blob/7beed92a802062af247243d6c06a65fbbc7a35de/Markdown.gitattributes
+
+# Apply override to all files in the directory
+*.md linguist-detectable
+
+## Custom
+
+*.gitattributes text linguist-detectable linguist-language=gitattributes
+*.gitignore text linguist-detectable linguist-language=gitignore
+
+*.py linguist-detectable
+*.toml diff=toml linguist-detectable
+*.yaml diff=yaml linguist-detectable
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e9175f6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,279 @@
+## Reference: https://github.com/github/gitignore/blob/4488915eec0b3a45b5c63ead28f286819c0917de/Global/Linux.gitignore
+
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+## Reference: https://github.com/github/gitignore/blob/4488915eec0b3a45b5c63ead28f286819c0917de/Global/macOS.gitignore
+
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+## Reference: https://github.com/github/gitignore/blob/4488915eec0b3a45b5c63ead28f286819c0917de/Global/Windows.gitignore
+
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+## Reference: https://github.com/github/gitignore/blob/4488915eec0b3a45b5c63ead28f286819c0917de/Global/Vim.gitignore
+
+# Swap
+[._]*.s[a-v][a-z]
+!*.svg
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+
+# Session
+Session.vim
+Sessionx.vim
+
+# Temporary
+.netrwhist
+*~
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
+
+## Reference: https://github.com/github/gitignore/blob/4488915eec0b3a45b5c63ead28f286819c0917de/Global/VisualStudioCode.gitignore
+
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+## Reference: https://github.com/github/gitignore/blob/4488915eec0b3a45b5c63ead28f286819c0917de/Python.gitignore
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+## Custom
+
+coverage_data
+coverage_html_report/
+coverage_xml_report.xml
+pytest_junit_report.xml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..528ff65
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,174 @@
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: check-ast
+ - id: check-case-conflict
+ - id: check-executables-have-shebangs
+ - id: check-json
+ - id: check-merge-conflict
+ - id: check-shebang-scripts-are-executable
+ - id: check-symlinks
+ - id: check-toml
+ - id: check-yaml
+ args:
+ - --allow-multiple-documents
+ - id: detect-private-key
+ - id: end-of-file-fixer
+ - id: mixed-line-ending
+ - id: name-tests-test
+ args:
+ - --pytest-test-first
+ - id: no-commit-to-branch
+ - id: pretty-format-json
+ args:
+ - --autofix
+ - --indent
+ - "4"
+ - id: requirements-txt-fixer
+ - id: trailing-whitespace
+ - repo: https://github.com/asottile/pyupgrade
+ rev: v3.15.0
+ hooks:
+ - id: pyupgrade
+ args:
+ - --keep-runtime-typing
+ - --py311-plus
+ - repo: https://github.com/pycqa/autoflake
+ rev: v2.2.1
+ hooks:
+ - id: autoflake
+ args:
+ - src
+ pass_filenames: false
+ - repo: https://github.com/pycqa/isort
+ rev: 5.13.2
+ hooks:
+ - id: isort
+ args:
+ - src
+ pass_filenames: false
+ - repo: https://github.com/psf/black
+ rev: 23.12.1
+ hooks:
+ - id: black-jupyter
+ args:
+ - src
+ pass_filenames: false
+ - repo: https://github.com/pycqa/bandit
+ rev: 1.7.6
+ hooks:
+ - id: bandit
+ args:
+ - --recursive
+ - --severity-level
+ - high
+ - --confidence-level
+ - high
+ - src
+ pass_filenames: false
+ - repo: https://github.com/pycqa/flake8
+ rev: 6.1.0
+ hooks:
+ - id: flake8
+ args:
+ - src
+ pass_filenames: false
+ # - repo: https://github.com/pre-commit/mirrors-mypy
+ # rev: v1.7.1
+ # hooks:
+ # - id: mypy
+ # additional_dependencies:
+ # - pydantic
+ # args:
+ # - --ignore-missing-imports
+ # - --scripts-are-modules
+ # pass_filenames: false
+ # stages:
+ # - manual
+ - repo: https://github.com/PyCQA/pylint
+ rev: v3.0.3
+ hooks:
+ - id: pylint
+ args:
+ - --disable
+ - import-error
+ - src
+ pass_filenames: false
+ stages:
+ - manual
+ # - repo: https://github.com/RobertCraigie/pyright-python
+ # rev: v1.1.337
+ # hooks:
+ # - id: pyright
+ # pass_filenames: false
+ # stages:
+ # - manual
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.1.9
+ hooks:
+ - id: ruff
+ args:
+ - src
+ pass_filenames: false
+ - repo: https://github.com/jendrikseipp/vulture
+ rev: v2.10
+ hooks:
+ - id: vulture
+ pass_filenames: false
+ # - repo: https://github.com/PyCQA/docformatter
+ # rev: v1.7.5
+ # hooks:
+ # - id: docformatter
+ # additional_dependencies:
+ # - tomli
+ # args:
+ # - --in-place
+ # - src
+ # pass_filenames: false
+ # - repo: https://github.com/adamchainz/blacken-docs
+ # rev: 1.16.0
+ # hooks:
+ # - id: blacken-docs
+ # args:
+ # - --line-length
+ # - "87"
+ # - --target-version
+ # - py311
+ # - repo: https://github.com/econchick/interrogate
+ # rev: 1.5.0
+ # hooks:
+ # - id: interrogate
+ # args:
+ # - src
+ # pass_filenames: false
+ # - repo: https://github.com/pycqa/pydocstyle
+ # rev: 6.3.0
+ # hooks:
+ # - id: pydocstyle
+ # additional_dependencies:
+ # - tomli
+ # args:
+ # - src
+ # pass_filenames: false
+ - repo: https://github.com/tox-dev/pyproject-fmt
+ rev: 1.5.3
+ hooks:
+ - id: pyproject-fmt
+ - repo: https://github.com/abravalheri/validate-pyproject
+ rev: v0.15
+ hooks:
+ - id: validate-pyproject
+ - repo: https://github.com/codespell-project/codespell
+ rev: v2.2.6
+ hooks:
+ - id: codespell
+ additional_dependencies:
+ - tomli
+ args:
+ - --write-changes
+ stages:
+ - manual
+default_language_version:
+ python: python3.11
+fail_fast: false
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..985bd07
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Anirban Ray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..10ffb23
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# Query Package Documentation
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..99cdfe8
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,387 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = [
+ "setuptools>=65.5.1",
+]
+
+[project]
+name = "query-package-documentation"
+version = "0.0.1"
+description = "A package to explore documentation"
+keywords = [
+ "documentation",
+ "generative-ai",
+]
+license = { file = "LICENSE" }
+maintainers = [
+ { name = "Anirban Ray", email = "39331844+yarnabrina@users.noreply.github.com" },
+]
+authors = [
+ { name = "Anirban Ray", email = "39331844+yarnabrina@users.noreply.github.com" },
+]
+requires-python = "==3.11.*"
+classifiers = [
+ "Development Status :: 3 - Alpha",
+ "Framework :: Flake8",
+ "Framework :: Pydantic",
+ "Framework :: Pytest",
+ "Framework :: Sphinx",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved",
+ "License :: OSI Approved :: MIT License",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: 3.11",
+ "Topic :: Software Development",
+ "Topic :: Software Development :: Build Tools",
+ "Topic :: Software Development :: Libraries",
+ "Topic :: Utilities",
+ "Typing :: Typed",
+]
+dynamic = [
+ "readme",
+]
+dependencies = [
+ "chromadb<0.5,>=0.4.15",
+ "ctransformers<0.3,>=0.2.27",
+ "gradio<4.13,>=4.12",
+ "jq<1.7,>=1.6",
+ "langchain==0.0.353",
+ "numpydoc<1.7,>=1.6",
+ "pydantic<2.6,>=2.4.2",
+ "sentence-transformers<2.3,>=2.2.2",
+ "transformers<4.37,>=4.35",
+ "typer<0.10,>=0.9",
+ "typing-extensions<4.10,>=4.9",
+]
+[project.optional-dependencies]
+all = [
+ "autoflake",
+ "bandit",
+ "black",
+ "blacken-docs",
+ "build",
+ "codespell",
+ "coverage[toml]",
+ "docformatter[tomli]",
+ "flake8",
+ "furo",
+ "hypothesis[pytest]",
+ "interrogate",
+ "isort",
+ "mypy",
+ "nox",
+ "pre-commit",
+ "pydocstyle[toml]",
+ "pylint",
+ "pyproject-fmt",
+ "pyright",
+ "pytest",
+ "pyupgrade",
+ "Sphinx",
+ "sphinx-copybutton",
+ "twine",
+ "validate-pyproject",
+ "vulture",
+]
+dev = [
+ "codespell",
+ "nox",
+ "pre-commit",
+]
+doc = [
+ "furo",
+ "Sphinx",
+ "sphinx-copybutton",
+]
+fine-tuning = [
+ "accelerate<0.26,>=0.24.1",
+ "bitsandbytes<0.42,>=0.41.2",
+ "datasets<2.17,>=2.15",
+ "peft<0.8,>=0.6.2",
+ "safetensors<0.5,>=0.4",
+ "torch<2.2,>=2.1.1",
+ "transformers<4.37,>=4.35.2",
+ "trl<0.8,>=0.7.4",
+]
+format = [
+ "autoflake",
+ "black",
+ "blacken-docs",
+ "docformatter[tomli]",
+ "isort",
+ "pyproject-fmt",
+ "pyupgrade",
+]
+lint = [
+ "bandit",
+ "flake8",
+ "interrogate",
+ "mypy",
+ "pydocstyle[toml]",
+ "pylint",
+ "pyright",
+ "validate-pyproject",
+ "vulture",
+]
+release = [
+ "build",
+ "twine",
+]
+test = [
+ "coverage[toml]",
+ "hypothesis[pytest]",
+ "pytest",
+]
+[project.urls]
+"Bug Tracker" = "https://github.com/yarnabrina/query-package-documentation/issues"
+"Documentation" = "https://query-package-documentation.readthedocs.io"
+"Source Code" = "https://github.com/yarnabrina/query-package-documentation"
+[project.scripts]
+docs-cli = "cli:CLI_APPLICATION"
+[project.gui-scripts]
+docs-gui = "gui:main"
+
+[tool.setuptools]
+py-modules = [
+ "cli",
+ "gui",
+]
+
+[tool.setuptools.dynamic]
+readme = { file = "README.md", content-type = "text/markdown" }
+
+[tool.setuptools.packages.find]
+where = [
+ "src",
+]
+include = [
+ "generative_ai*",
+]
+exclude = [
+ "*tests*",
+]
+namespaces = false
+
+[tool.setuptools.package-data]
+"generative_ai" = [
+ "metadata.json",
+ "py.typed",
+]
+
+[tool.setuptools.exclude-package-data]
+"*" = [
+ ".gitattributes",
+ ".gitignore",
+]
+
+[tool.black]
+line-length = 99
+target-version = [
+ "py311",
+]
+safe = true
+
+[tool.ruff]
+fix = true
+ignore = [
+ "COM",
+ "D",
+ "D203",
+ "D213",
+ "DTZ",
+ "EM",
+ "FBT",
+ "FIX",
+ "G",
+ "ICN",
+ "PD",
+ "RET501",
+ "RET502",
+ "RET503",
+ "RET504",
+ "SLF",
+ "TRY003",
+]
+ignore-init-module-imports = true
+line-length = 99
+output-format = "grouped"
+select = [
+ "ALL",
+]
+src = [
+ "src",
+]
+target-version = "py311"
+
+[tool.ruff.flake8-annotations]
+allow-star-arg-any = true
+ignore-fully-untyped = true
+
+[tool.ruff.flake8-bugbear]
+extend-immutable-calls = [
+ "fastapi.Depends",
+ "fastapi.Query",
+ "pydantic.Field",
+]
+
+[tool.ruff.flake8-type-checking]
+exempt-modules = [
+ "typing",
+ "typing_extensions",
+]
+runtime-evaluated-base-classes = [
+ "pydantic.BaseModel",
+]
+
+[tool.ruff.pep8-naming]
+classmethod-decorators = [
+ "pydantic.field_validator",
+]
+
+[tool.ruff.per-file-ignores]
+"**/__init__.py" = [
+ "F401",
+]
+"**/test_*.py" = [
+ "S101",
+]
+
+[tool.ruff.pycodestyle]
+max-doc-length = 99
+
+[tool.ruff.pydocstyle]
+convention = "numpy"
+ignore-decorators = [
+ "typing.overload",
+]
+
+[tool.ruff.pyupgrade]
+keep-runtime-typing = true
+
+[tool.isort]
+overwrite_in_place = true
+profile = "black"
+atomic = true
+float_to_top = true
+line_length = 99
+remove_redundant_aliases = true
+src_paths = [
+ "src",
+]
+py_version = 311
+
+[tool.pylint.main]
+extension-pkg-allow-list = [
+ "pydantic",
+]
+fail-under = 8.5
+jobs = 0
+recursive = true
+
+[tool.pylint.basic]
+include-naming-hint = true
+
+[tool.pylint.format]
+max-line-length = 99
+
+[tool.pylint.logging]
+logging-format-style = "new"
+
+[tool.pylint."messages control"]
+enable = [
+ "all",
+]
+disable = [
+ "logging-fstring-interpolation",
+ "missing-class-docstring",
+ "missing-function-docstring",
+ "missing-module-docstring",
+]
+
+[tool.pylint.reports]
+output-format = "colorized"
+
+[tool.docformatter]
+in-place = true
+recursive = true
+wrap-summaries = 99
+wrap-descriptions = 99
+
+[tool.pytest.ini_options]
+addopts = "--junit-xml=pytest_junit_report.xml --doctest-modules --doctest-ignore-import-errors --doctest-continue-on-failure"
+console_output_style = "count"
+
+[tool.coverage.run]
+branch = true
+command_line = "--module pytest"
+data_file = "coverage_data"
+include = [
+ "src/**/*.py",
+]
+omit = [
+ "**/tests/*.py",
+]
+
+[tool.coverage.report]
+fail_under = 85
+include = [
+ "src/**/*.py",
+]
+omit = [
+ "**/tests/*.py",
+]
+precision = 2
+exclude_lines = [
+ "pragma: no cover",
+ "if __name__ == .__main__.:",
+ "if typing.TYPE_CHECKING:",
+]
+
+[tool.coverage.html]
+directory = "coverage_html_report"
+
+[tool.coverage.xml]
+output = "coverage_xml_report.xml"
+
+[tool.mypy]
+files = [
+ "src",
+]
+exclude = [
+ "conftest",
+ "test_",
+]
+strict = true
+
+[tool.pyright]
+include = [
+ "src",
+]
+exclude = [
+ "**/tests/*.py",
+]
+pythonVersion = "3.11"
+
+[tool.autoflake]
+in-place = true
+remove-all-unused-imports = true
+recursive = true
+expand-star-imports = true
+ignore-init-module-imports = true
+remove-duplicate-keys = true
+remove-unused-variables = true
+
+[tool.interrogate]
+fail-under = 85
+ignore-init-method = true
+
+[tool.pydocstyle]
+convention = "numpy"
+
+[tool.vulture]
+min_confidence = 100
+paths = [
+ "src",
+]
diff --git a/requirements/constraints.fine_tuning.txt b/requirements/constraints.fine_tuning.txt
new file mode 100644
index 0000000..3370671
--- /dev/null
+++ b/requirements/constraints.fine_tuning.txt
@@ -0,0 +1,8 @@
+accelerate<0.26,>=0.24.1
+bitsandbytes<0.42,>=0.41.2
+datasets<2.17,>=2.15.0
+peft<0.8,>=0.6.2
+safetensors<0.5,>=0.4.0
+torch<2.2,>=2.1.1
+transformers<4.37,>=4.35.2
+trl<0.8,>=0.7.4
diff --git a/requirements/constraints.txt b/requirements/constraints.txt
new file mode 100644
index 0000000..1f4c64d
--- /dev/null
+++ b/requirements/constraints.txt
@@ -0,0 +1,11 @@
+chromadb<0.5,>=0.4.15
+ctransformers<0.3,>=0.2.27
+gradio<4.13,>=4.12
+jq<1.7,>=1.6
+langchain==0.0.353
+numpydoc<1.7,>=1.6
+pydantic<2.6,>=2.4.2
+sentence-transformers<2.3,>=2.2.2
+transformers<4.37,>=4.35
+typer<0.10,>=0.9
+typing-extensions<4.10,>=4.9
diff --git a/requirements/requirements.fine_tuning.txt b/requirements/requirements.fine_tuning.txt
new file mode 100644
index 0000000..50dea65
--- /dev/null
+++ b/requirements/requirements.fine_tuning.txt
@@ -0,0 +1,8 @@
+accelerate
+bitsandbytes
+datasets
+peft
+safetensors
+torch
+transformers
+trl
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
new file mode 100644
index 0000000..a4e2e3a
--- /dev/null
+++ b/requirements/requirements.txt
@@ -0,0 +1,11 @@
+chromadb
+ctransformers
+gradio
+jq
+langchain
+numpydoc
+pydantic
+sentence-transformers
+transformers
+typer
+typing-extensions
diff --git a/src/cli.py b/src/cli.py
new file mode 100644
index 0000000..3221e9d
--- /dev/null
+++ b/src/cli.py
@@ -0,0 +1,88 @@
+import pathlib
+import sys
+
+import typer
+
+from generative_ai.information_retrieval import PipelineType, RetrievalType, TransformerType
+from generative_ai.top_level import create_database, create_dataset, get_response
+
+CLI_APPLICATION = typer.Typer(name="CLI for Generative AI application")
+
+
+@CLI_APPLICATION.command()
+def generate_dataset(
+ package_name: str,
+ dataset_file: pathlib.Path = pathlib.Path("json_documents.json"),
+ force: bool = False,
+) -> None:
+ try:
+ dataset_path = create_dataset(package_name, dataset_file, force)
+ except FileExistsError as error:
+ typer.echo(message=str(error), err=True)
+ sys.exit(1)
+ else:
+ typer.echo(f"Dataset generation complete: '{dataset_path}'.")
+
+
+@CLI_APPLICATION.command()
+def generate_database(
+ dataset_file: pathlib.Path = pathlib.Path("json_documents.json"),
+ embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
+ database_directory: pathlib.Path = pathlib.Path("embeddings_database"),
+ force: bool = False,
+) -> None:
+ try:
+ database_path = create_database(dataset_file, embedding_model, database_directory, force)
+ except (FileExistsError, FileNotFoundError) as error:
+ typer.echo(message=str(error), err=True)
+ sys.exit(1)
+ else:
+ typer.echo(f"Database generation complete: '{database_path}'.")
+
+
+@CLI_APPLICATION.command()
+def answer_query( # noqa: PLR0913
+ query: str,
+ embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
+ database_directory: pathlib.Path = pathlib.Path("embeddings_database"),
+ search_type: RetrievalType = RetrievalType.MMR,
+ number_of_documents: int = 3,
+ initial_number_of_documents: int = 5,
+ diversity_level: float = 0.5,
+ language_model_type: TransformerType = TransformerType.STANDARD_TRANSFORMERS,
+ standard_pipeline_type: PipelineType = PipelineType.TEXT2TEXT_GENERATION,
+ standard_model_name: str = "google/flan-t5-large",
+ quantised_model_name: str = "TheBloke/zephyr-7B-beta-GGUF",
+ quantised_model_file: str = "zephyr-7b-beta.Q4_K_M.gguf",
+ quantised_model_type: str = "mistral",
+) -> None:
+ try:
+ response = get_response(
+ query,
+ embedding_model,
+ database_directory,
+ search_type,
+ number_of_documents,
+ initial_number_of_documents,
+ diversity_level,
+ language_model_type,
+ standard_pipeline_type,
+ standard_model_name,
+ quantised_model_name,
+ quantised_model_file,
+ quantised_model_type,
+ )
+ except FileNotFoundError as error:
+ typer.echo(message=str(error), err=True)
+ sys.exit(1)
+ else:
+ typer.echo(f"Query: {response.query}")
+ typer.echo(f"Answer: {response.answer}")
+ typer.echo(f"Duration: {response.llm_duration:.2f} seconds")
+
+ for counter, source_document in enumerate(response.source_documents):
+ typer.echo(f"Source {counter + 1}: {source_document}")
+
+
+if __name__ == "__main__":
+ CLI_APPLICATION()
diff --git a/src/generative_ai/__init__.py b/src/generative_ai/__init__.py
new file mode 100644
index 0000000..e2361cb
--- /dev/null
+++ b/src/generative_ai/__init__.py
@@ -0,0 +1,23 @@
+import importlib.resources
+import json
+import typing
+
+
+class PackageMetadata(typing.TypedDict):
+ Name: str
+ Version: str
+ Description: str
+ Keywords: list[str]
+ License: str
+ Maintainers: list[str]
+ Authors: list[str]
+ Links: dict[str, str]
+
+
+METADATA_CONTENTS: str = (
+ importlib.resources.files("generative_ai").joinpath("metadata.json").read_text()
+)
+METADATA: PackageMetadata = json.loads(METADATA_CONTENTS)
+
+__version__: str = METADATA["Version"]
+__all__: list[str] = ["METADATA", "__version__"]
diff --git a/src/generative_ai/dataset_generation/__init__.py b/src/generative_ai/dataset_generation/__init__.py
new file mode 100644
index 0000000..ff9a691
--- /dev/null
+++ b/src/generative_ai/dataset_generation/__init__.py
@@ -0,0 +1,32 @@
+from .orchestrate_generation import (
+ generate_json_dataset,
+ generate_raw_datasets,
+ load_json_dataset,
+ store_json_dataset,
+)
+from .step_1_generation import (
+ get_all_member_details,
+ get_all_module_contents,
+ get_all_package_contents,
+)
+from .step_2_generation import (
+ generate_member_dataset,
+ generate_module_dataset,
+ generate_package_dataset,
+)
+from .utils_generation import JSONDataset, JSONDocument
+
+__all__ = [
+ "JSONDataset",
+ "JSONDocument",
+ "generate_json_dataset",
+ "generate_member_dataset",
+ "generate_module_dataset",
+ "generate_package_dataset",
+ "generate_raw_datasets",
+ "get_all_member_details",
+ "get_all_module_contents",
+ "get_all_package_contents",
+ "load_json_dataset",
+ "store_json_dataset",
+]
diff --git a/src/generative_ai/dataset_generation/orchestrate_generation.py b/src/generative_ai/dataset_generation/orchestrate_generation.py
new file mode 100644
index 0000000..9b098a8
--- /dev/null
+++ b/src/generative_ai/dataset_generation/orchestrate_generation.py
@@ -0,0 +1,106 @@
+import itertools
+import json
+import logging
+import pathlib
+
+import pydantic
+
+from .step_1_generation import (
+ get_all_member_details,
+ get_all_module_contents,
+ get_all_package_contents,
+)
+from .step_2_generation import (
+ generate_member_dataset,
+ generate_module_dataset,
+ generate_package_dataset,
+)
+from .utils_generation import Dataset, JSONDataset, JSONDocument, MemberDetails, Module
+
+LOGGER = logging.getLogger(__name__)
+
+
@pydantic.validate_call(validate_return=True)
def generate_raw_datasets(package_name: str) -> list[Dataset]:
    """Recursively collect datasets for a package, its modules and their members.

    Parameters
    ----------
    package_name : str
        qualified name of the root package to document

    Returns
    -------
    list[Dataset]
        one dataset per discovered package, module and member
    """
    all_package_contents = get_all_package_contents(package_name)
    LOGGER.info(f"Enlisted total {len(all_package_contents)} packages recursively.")

    all_module_contents: list[Module] = []
    for package_contents in all_package_contents:
        for module in package_contents.children_modules_names:
            try:
                module_contents = get_all_module_contents(
                    f"{package_contents.package_qualified_name}.{module}"
                )
            except ImportError:
                LOGGER.warning(f"Failed to import {module=}.")

                continue

            all_module_contents.append(module_contents)

    LOGGER.info(f"Enlisted total {len(all_module_contents)} modules recursively.")

    all_member_details: list[MemberDetails] = []
    for module_contents in all_module_contents:
        for member in module_contents.module_members:
            try:
                member_details = get_all_member_details(
                    module_contents.module_qualified_name, member.member_name, member.member_object
                )
            except (TypeError, ValueError):
                # members without an introspectable signature/docstring are skipped
                continue

            all_member_details.append(member_details)

    LOGGER.info(f"Enlisted total {len(all_member_details)} members recursively.")

    package_datasets = map(generate_package_dataset, all_package_contents)
    module_datasets = map(generate_module_dataset, all_module_contents)
    member_datasets = map(generate_member_dataset, all_member_details)

    # BUG FIX: `member_datasets` was previously star-unpacked (`*member_datasets`),
    # which made `chain` iterate *inside* each member Dataset (pydantic models
    # iterate as (field, value) pairs) instead of chaining the Dataset objects.
    combined_datasets = itertools.chain(package_datasets, module_datasets, member_datasets)

    return list(combined_datasets)
+
+
@pydantic.validate_call(validate_return=True)
def generate_json_dataset(raw_datasets: list[Dataset]) -> JSONDataset:
    """Flatten raw datasets into one JSON-serialisable dataset.

    Parameters
    ----------
    raw_datasets : list[Dataset]
        datasets produced for packages, modules and members

    Returns
    -------
    JSONDataset
        combined retrieval chunks and tuning documents
    """
    retrieval_documents: list[str] = []
    tuning_documents: list[JSONDocument] = []

    for dataset in raw_datasets:
        # accumulate retrieval chunks as-is
        retrieval_documents += dataset.retrieval_chunks

        # round-trip each tuning document through a dict to coerce it to JSONDocument
        for document in dataset.tuning_documents:
            tuning_documents.append(JSONDocument.model_validate(document.model_dump()))

    return JSONDataset.model_validate(
        {"retrieval_documents": retrieval_documents, "tuning_documents": tuning_documents}
    )
+
+
@pydantic.validate_call
def store_json_dataset(json_dataset: JSONDataset, file_path: pathlib.Path) -> None:
    """Serialise ``json_dataset`` to ``file_path`` as indented JSON.

    Parameters
    ----------
    json_dataset : JSONDataset
        dataset to persist
    file_path : pathlib.Path
        destination file; overwritten if it exists
    """
    # `file_path` is already coerced to `pathlib.Path` by `validate_call`,
    # so the former `pathlib.Path(file_path)` re-wrap was redundant.
    with file_path.open(mode="w", encoding="utf-8") as file_object:
        json.dump(json_dataset.model_dump(), file_object, indent=4)
+
+
@pydantic.validate_call(validate_return=True)
def load_json_dataset(file_path: pathlib.Path) -> JSONDataset:
    """Load and validate a JSON dataset previously stored on disk.

    Parameters
    ----------
    file_path : pathlib.Path
        file written by ``store_json_dataset``

    Returns
    -------
    JSONDataset
        validated dataset contents
    """
    # `file_path` is already a `pathlib.Path` (validate_call coercion), so the
    # former `pathlib.Path(file_path)` re-wrap was redundant.
    with file_path.open(mode="r", encoding="utf-8") as file_object:
        json_dataset = json.load(file_object)

    return JSONDataset.model_validate(json_dataset)
+
+
# Public API of this module; helper-free orchestration entry points only.
__all__ = [
    "generate_json_dataset",
    "generate_raw_datasets",
    "load_json_dataset",
    "store_json_dataset",
]
diff --git a/src/generative_ai/dataset_generation/step_1_generation.py b/src/generative_ai/dataset_generation/step_1_generation.py
new file mode 100644
index 0000000..0cbfc51
--- /dev/null
+++ b/src/generative_ai/dataset_generation/step_1_generation.py
@@ -0,0 +1,291 @@
+import enum
+import importlib
+import importlib.util
+import inspect
+import logging
+import pkgutil
+import types
+import typing
+
+import pydantic
+from numpydoc.docscrape import NumpyDocString
+
+from .utils_generation import (
+ Attribute,
+ ClassDetails,
+ EnumDetails,
+ EnumMember,
+ FunctionDetails,
+ MemberDetails,
+ MemberType,
+ Method,
+ Module,
+ ModuleMember,
+ Package,
+ Parameter,
+ Raises,
+ Returns,
+ Warns,
+)
+
+LOGGER = logging.getLogger(__name__)
+
+
@pydantic.validate_call(validate_return=True)
def import_package(package_name: str) -> pydantic.InstanceOf[types.ModuleType]:
    """Create a module object for ``package_name`` from its import spec.

    The module is created with ``module_from_spec`` and is NOT executed; callers
    use it only for spec-derived attributes such as ``__path__``.

    Parameters
    ----------
    package_name : str
        qualified name of the package to locate

    Returns
    -------
    types.ModuleType
        unexecuted module object built from the found spec

    Raises
    ------
    ValueError
        if no spec can be found for ``package_name``
    """
    spec = importlib.util.find_spec(package_name)

    if spec is None:
        LOGGER.error(f"spec for {package_name=} could not be found")

        raise ValueError(f"{package_name=} is not found")

    return importlib.util.module_from_spec(spec)
+
+
@pydantic.validate_call(validate_return=True)
def get_all_package_contents(package_name: str) -> list[Package]:
    """Walk ``package_name`` depth-first and describe every (sub-)package.

    Parameters
    ----------
    package_name : str
        qualified name of the root package

    Returns
    -------
    list[Package]
        one entry per importable package/sub-package (tests excluded)
    """
    package_contents = []

    # stack of (package qualified name, parent qualified name or None for the root)
    sub_packages_stack: list[tuple[str, str | None]] = [(package_name, None)]

    while sub_packages_stack:
        current_package_name, parent_package_name = sub_packages_stack.pop()

        current_package_hierarchy = current_package_name.split(".")

        try:
            current_package_loader = import_package(current_package_name)
        # BUG FIX: `import_package` raises ValueError (not ImportError) when the
        # spec is missing; catching only ImportError let ValueError escape.
        except (ImportError, ValueError):
            LOGGER.warning(f"{current_package_name=} could not be imported")

            continue

        try:
            current_package = importlib.import_module(current_package_name)
        except ImportError:
            LOGGER.warning(f"{current_package_name=} could not be imported")

            continue

        current_package_sub_packages = []
        current_package_modules = []

        for _, name, ispkg in pkgutil.walk_packages(
            path=current_package_loader.__path__, prefix=f"{current_package_loader.__name__}."
        ):
            # skip test packages/modules entirely
            if "tests" in name:
                continue

            # only direct children: skip anything nested deeper than one level
            if "." in name.removeprefix(f"{current_package_name}."):
                continue

            if ispkg:
                current_package_sub_packages.append(name)
            else:
                current_package_modules.append(name)

        package_contents.append(
            Package(
                package_name=current_package_hierarchy[-1],
                package_qualified_name=current_package_name,
                package_hierarchy=current_package_hierarchy,
                parent_package_name=parent_package_name,
                children_sub_packages_names=[
                    sub_package.removeprefix(f"{current_package_name}.")
                    for sub_package in current_package_sub_packages
                ],
                children_modules_names=[
                    module.removeprefix(f"{current_package_name}.")
                    for module in current_package_modules
                ],
                package_summary=getattr(current_package, "__doc__", None),
                package_all_exports=getattr(current_package, "__all__", None),
            )
        )

        # queue the discovered sub-packages for traversal (replaces PERF401 loop)
        sub_packages_stack.extend(
            (sub_package_name, current_package_name)
            for sub_package_name in current_package_sub_packages
        )

    return package_contents
+
+
@pydantic.validate_call(validate_return=True)
def get_all_module_contents(module_name: str) -> Module:
    """Import ``module_name`` and collect its directly defined members.

    Parameters
    ----------
    module_name : str
        qualified name of the module to introspect

    Returns
    -------
    Module
        module description including members, summary and ``__all__`` exports
    """
    module_hierarchy = module_name.split(".")

    module = importlib.import_module(module_name)

    # keep only members actually defined in this module (not re-exported names)
    module_contents = inspect.getmembers(
        module, predicate=lambda member: inspect.getmodule(member) == module
    )

    return Module(
        module_name=module_hierarchy[-1],
        module_qualified_name=module_name,
        module_hierarchy=module_hierarchy,
        package_name=".".join(module_hierarchy[:-1]),
        module_members=[
            ModuleMember(member_name=member[0], member_object=member[1])
            for member in module_contents
        ],
        module_summary=inspect.getdoc(module),
        # reuse the already-imported module instead of importing it a second time
        module_all_exports=getattr(module, "__all__", None),
    )
+
+
@pydantic.validate_call(validate_return=True)
def get_all_parameters_details(
    signature: pydantic.InstanceOf[inspect.Signature],
    docstring: pydantic.InstanceOf[NumpyDocString],
) -> list[Parameter]:
    """Merge signature and docstring information for each parameter.

    Docstring-provided annotations/summaries take precedence; the signature
    supplies defaults, kinds, and fallback annotations.

    Parameters
    ----------
    signature : inspect.Signature
        parsed callable signature
    docstring : NumpyDocString
        parsed numpydoc docstring of the same callable

    Returns
    -------
    list[Parameter]
        one merged record per signature parameter, in signature order
    """
    # keys are discarded, so iterate .values() instead of .items() (PERF102)
    parameter_signature = {
        parameter.name: {
            "parameter_default": parameter.default,
            "parameter_annotation": parameter.annotation,
            "parameter_kind": parameter.kind.description,
        }
        for parameter in signature.parameters.values()
    }
    parameter_docstring = {
        parameter.name: {
            "parameter_annotation": parameter.type,
            "parameter_summary": " ".join(parameter.desc),
        }
        for parameter in docstring["Parameters"]
    }

    parameter_details = [
        Parameter.model_validate(
            {
                "parameter_name": parameter_name,
                "parameter_default": parameter_signature_details["parameter_default"],
                "parameter_annotation": parameter_docstring.get(parameter_name, {}).get(
                    "parameter_annotation", None
                )
                or parameter_signature_details["parameter_annotation"],
                "parameter_kind": parameter_signature_details["parameter_kind"],
                "parameter_summary": parameter_docstring.get(parameter_name, {}).get(
                    "parameter_summary", None
                ),
            }
        )
        for parameter_name, parameter_signature_details in parameter_signature.items()
    ]

    return parameter_details
+
+
@pydantic.validate_call(validate_return=True)
def get_all_returns_details(
    signature: pydantic.InstanceOf[inspect.Signature],
    docstring: pydantic.InstanceOf[NumpyDocString],
) -> Returns:
    """Merge return information from the signature and the docstring.

    Only the first documented return value is considered; the docstring
    annotation, when present, wins over the signature annotation.
    """
    returns_signature = signature.return_annotation

    documented_returns = docstring["Returns"]
    if not documented_returns:
        # nothing documented: fall back to the signature annotation alone
        return Returns(returns_annotation=returns_signature)

    first_documented = documented_returns[0]
    returns_docstring = {
        "returns_annotation": first_documented.type,
        "returns_summary": " ".join(first_documented.desc),
    }

    return Returns(
        returns_annotation=returns_docstring.get("returns_annotation", None) or returns_signature,
        returns_summary=returns_docstring.get("returns_summary", None),
    )
+
+
@pydantic.validate_call(validate_return=True)
def get_all_member_details(
    module_name: str, member_name: str, member_object: typing.Any # noqa: ANN401
) -> MemberDetails:
    """Describe a single module member (enum, class, or callable).

    Builds identity fields from the module path, then attaches type-specific
    details parsed from the member's signature and numpydoc docstring.
    Members that are neither enums, classes, nor callables get no
    ``member_type_details`` entry.
    """
    member_hierarchy = [*module_name.split("."), member_name]

    member_details: dict[str, typing.Any] = {
        "member_name": member_name,
        "member_qualified_name": ".".join(member_hierarchy),
        "member_hierarchy": member_hierarchy,
        "member_module": member_hierarchy[-2],
    }

    # missing docstrings are normalised to "" so NumpyDocString always parses
    member_details["member_docstring"] = inspect.getdoc(member_object) or ""
    parsed_docstring = NumpyDocString(member_details["member_docstring"])

    # enums: record each name/value pair
    if isinstance(member_object, enum.EnumType):
        member_details["member_type_details"] = EnumDetails(
            member_type=MemberType.ENUM,
            enum_members=[
                EnumMember(enum_member_name=enum_member.name, enum_member_value=enum_member.value)
                for enum_member in member_object
            ],
        )
    # classes: constructor parameters, public methods and public attributes
    elif inspect.isclass(member_object):
        member_details["member_type_details"] = ClassDetails(
            member_type=MemberType.CLASS,
            class_parameters=get_all_parameters_details(
                inspect.signature(member_object), parsed_docstring
            ),
            class_methods=[
                Method(
                    method_name=method[0],
                    method_parameters=[
                        parameter
                        for parameter, _ in inspect.signature(method[1]).parameters.items()
                    ],
                    method_summary=inspect.getdoc(method[1]),
                )
                for method in inspect.getmembers(member_object, predicate=inspect.ismethod)
                if not method[0].startswith("_")
            ],
            class_attributes=[
                Attribute(attribute_name=attribute[0])
                for attribute in inspect.getmembers(
                    member_object,
                    # non-method, non-callable members are treated as attributes
                    predicate=lambda member: not inspect.ismethod(member) and not callable(member),
                )
                if not attribute[0].startswith("_")
            ],
            class_summary=" ".join(
                parsed_docstring["Summary"] + parsed_docstring["Extended Summary"]
            ),
            class_notes=" ".join(parsed_docstring["See Also"] + parsed_docstring["Notes"]),
        )
    # any other callable: treated as a function
    elif callable(member_object):
        member_details["member_type_details"] = FunctionDetails(
            member_type=MemberType.FUNCTION,
            function_parameters=get_all_parameters_details(
                inspect.signature(member_object), parsed_docstring
            ),
            function_returns=get_all_returns_details(
                inspect.signature(member_object), parsed_docstring
            ),
            function_summary=" ".join(
                parsed_docstring["Summary"] + parsed_docstring["Extended Summary"]
            ),
            function_raises=[
                Raises(raises_type=raises.type, raises_summary=" ".join(raises.desc))
                for raises in parsed_docstring["Raises"]
            ],
            function_warns=[
                Warns(warns_type=warns.type, warns_summary=" ".join(warns.desc))
                for warns in parsed_docstring["Warns"]
            ],
            function_notes="".join(parsed_docstring["Notes"]),
            function_references="".join(parsed_docstring["References"]),
            function_examples="".join(parsed_docstring["Examples"]),
        )

    return MemberDetails.model_validate(member_details)
+
+
# Public API of this module.
# CONSISTENCY FIX: `get_all_returns_details` was missing although its sibling
# helper `get_all_parameters_details` is exported; adding it is backward-compatible.
__all__ = [
    "get_all_member_details",
    "get_all_module_contents",
    "get_all_package_contents",
    "get_all_parameters_details",
    "get_all_returns_details",
    "import_package",
]
diff --git a/src/generative_ai/dataset_generation/step_2_generation.py b/src/generative_ai/dataset_generation/step_2_generation.py
new file mode 100644
index 0000000..99debe8
--- /dev/null
+++ b/src/generative_ai/dataset_generation/step_2_generation.py
@@ -0,0 +1,2787 @@
+import inspect
+import logging
+import random
+
+import pydantic
+
+from .utils_generation import (
+ ClassDetails,
+ Dataset,
+ EnumDetails,
+ FunctionDetails,
+ MemberDetails,
+ MemberType,
+ Module,
+ Package,
+)
+
# NOTE(review): this seeds the *global* `random` module at import time.
# Presumably intended to make any sampling in this module reproducible, but it
# also affects every other user of the shared `random` state — confirm intent.
random.seed(a=0)

LOGGER = logging.getLogger(__name__)
+
+
@pydantic.validate_call(validate_return=True)
def enumerate_array_elements(array: list, attribute: str | None = None) -> str:
    """Render ``array`` as a single numbered string like ``"1. a 2. b"``.

    Parameters
    ----------
    array : list
        elements to enumerate; non-string elements are rendered via ``attribute``
    attribute : str | None
        attribute name to read from non-string elements

    Returns
    -------
    str
        space-separated, 1-based numbered listing of the elements

    Raises
    ------
    ValueError
        if a non-string element is encountered while ``attribute`` is None
    """
    elements = []
    for element in array:
        if isinstance(element, str):
            elements.append(element)
        elif attribute is not None:
            elements.append(getattr(element, attribute))
        else:
            LOGGER.error(f"Received {attribute=} along with {array=}")

            raise ValueError("attribute must be non-null if array elements are not string")

    # idiom: enumerate with start=1 instead of manually adding 1 to the counter
    return " ".join(f"{counter}. {element}" for counter, element in enumerate(elements, start=1))
+
+
@pydantic.validate_call(validate_return=True)
def generate_package_dataset(package_contents: Package) -> Dataset: # noqa: PLR0915
    """Build retrieval chunks and Q/A tuning pairs describing one package.

    For each fact about the package (parentage, full name, hierarchy,
    sub-packages, modules, summary, public exports) this appends one retrieval
    chunk and several paraphrased question/answer tuning pairs. Branches handle
    the "absent" case (e.g. no sub-packages) with matching negative answers.
    """
    package_name = package_contents.package_name
    package_full_name = package_contents.package_qualified_name

    package = f"'{package_name}' package"

    package_retrieval_chunks: list[str] = [f"'{package_name}' is a Python package."]
    package_tuning_pairs: list[tuple[str, str]] = []

    # parentage: root packages answer "no parent"; others name the parent package
    if (parent_package := package_contents.parent_package_name) is None:
        root_package_pairs = [
            ("What is the root package?", f"'{package_name}' is the root package."),
            (
                "Can you tell me what the root package is?",
                f"Sure, the root package is '{package_name}'.",
            ),
            (
                "I'm trying to find out the root package. Can you help?",
                f"Of course, the root package is '{package_name}'.",
            ),
            (
                "Do you know what the root package is?",
                f"Yes, the root package is '{package_name}'.",
            ),
            (
                "I'd like to know the root package.",
                f"The root package you're asking about is '{package_name}'.",
            ),
            (
                "Could you identify the root package?",
                f"Certainly, '{package_name}' is the root package.",
            ),
        ]
        package_retrieval_chunks.append(f"'{package_name}' is the root package.")
        package_tuning_pairs.extend(root_package_pairs)

        parent_package_pairs = [
            (
                f"Name parent package of '{package_name}'.",
                f"Being the root package, '{package_name}' has no parent package.",
            ),
            (
                f"What is the parent package of '{package_name}'?",
                f"The root package '{package_name}' does not have a parent package.",
            ),
            (
                f"Can you tell me the parent package of '{package_name}'?",
                f"'{package_name}' is a root package and therefore,"
                " it does not have a parent package.",
            ),
            (
                f"Could you identify the parent package of '{package_name}'?",
                f"As a root package, '{package_name}' does not possess a parent package.",
            ),
            (
                f"I'm looking for the parent package of '{package_name}'. Can you help?",
                f"Sure, '{package_name}' is a root package, so it doesn't have a parent package.",
            ),
            (
                f"Do you know the parent package of '{package_name}'?",
                f"Yes, '{package_name}' is a root package and hence,"
                " it doesn't have a parent package.",
            ),
        ]
        package_retrieval_chunks.append(f"'{package_name}' has no parent package.")
        package_tuning_pairs.extend(parent_package_pairs)
    else:
        parent_package_pairs = [
            (
                f"Name parent package of '{package_name}' sub-package.",
                f"'{parent_package}' is the full name of its parent package.",
            ),
            (
                f"What is the parent package of the '{package_name}' sub-package?",
                f"The parent package of '{package_name}' is '{parent_package}'.",
            ),
            (
                f"Could you tell me the parent package of '{package_name}'?",
                f"Sure, the parent package of '{package_name}' is '{parent_package}'.",
            ),
            (
                f"I need to know the parent package of '{package_name}'.",
                f"The parent package of '{package_name}' is '{parent_package}'.",
            ),
            (
                f"Identify the parent package for the '{package_name}' sub-package.",
                f"The parent package for '{package_name}' is identified as '{parent_package}'.",
            ),
            (
                f"Can you name the parent package of the '{package_name}' sub-package?",
                f"Yes, the parent package of '{package_name}' is '{parent_package}'.",
            ),
        ]
        package_retrieval_chunks.append(
            f"'{package_name}' is part of parent package '{parent_package}'."
        )
        package_tuning_pairs.extend(parent_package_pairs)

        # fully qualified name only makes sense for sub-packages (root == full name)
        package_full_name_pairs = [
            (
                f"Tell the full name of '{package_name}' sub-package.",
                f"'{package_full_name}' is the fully qualified name of '{package_name}'.",
            ),
            (
                f"What is the fully qualified name of the '{package_name}' sub-package?",
                f"Fully qualified name of '{package_name}' sub-package is '{package_full_name}'.",
            ),
            (
                f"Could you provide the full name of the '{package_name}' sub-package?",
                f"Sure, the full name of '{package_name}' sub-package is '{package_full_name}'.",
            ),
            (
                f"I need the full name of the '{package_name}' sub-package. Can you tell me?",
                f"Of course, full name of '{package_name}' sub-package is '{package_full_name}'.",
            ),
            (
                f"Can you inform me about the full name of the '{package_name}' sub-package?",
                f"Certainly, full name of '{package_name}' sub-package is '{package_full_name}'.",
            ),
            (
                f"Please, reveal the full name of the '{package_name}' sub-package.",
                f"Absolutely, full name of '{package_name}' sub-package is '{package_full_name}'.",
            ),
        ]
        package_retrieval_chunks.append(
            f"Full name of '{package_name}' sub-package is '{package_full_name}'."
        )
        package_tuning_pairs.extend(package_full_name_pairs)

    # hierarchy: always present
    package_hierarchy = enumerate_array_elements(package_contents.package_hierarchy)
    package_hierarchy_pairs = [
        (
            f"What is the hierarchy of {package}?",
            f"The hierarchy of {package} is as follows: {package_hierarchy}.",
        ),
        (
            f"Can you explain the hierarchy of the {package}?",
            f"Sure, the hierarchy of the {package} is: {package_hierarchy}.",
        ),
        (
            f"Could you describe the structure of the {package}?",
            f"Of course, the structure of {package} is: {package_hierarchy}.",
        ),
        (
            f"I need to understand the hierarchy of {package}. Can you help?",
            f"Absolutely, the hierarchy of {package} is: {package_hierarchy}.",
        ),
        (
            f"Please provide the hierarchy of the {package}.",
            f"The hierarchy of the {package} is: {package_hierarchy}.",
        ),
        (
            f"I'm interested in the structure of the {package}. What is it?",
            f"The structure of {package} is as follows: {package_hierarchy}.",
        ),
    ]
    package_retrieval_chunks.append(
        f"Hierarchy of {package} is as follows: {package_hierarchy}."
    )
    package_tuning_pairs.extend(package_hierarchy_pairs)

    # sub-packages: negative answers when absent, count + listing when present
    if not (children_sub_packages := package_contents.children_sub_packages_names):
        package_sub_package_pairs = [
            (
                f"List the sub-packages of {package}.",
                f"{package} does not have any further sub-packages.",
            ),
            (
                f"What are the sub-packages of the {package}?",
                f"The {package} does not contain any sub-packages.",
            ),
            (
                f"Could you tell me the sub-packages of {package}?",
                f"I'm sorry, but the {package} doesn't have any sub-packages.",
            ),
            (
                f"I need to know the sub-packages of {package}. Can you list them?",
                f"Unfortunately, {package} doesn't include any sub-packages.",
            ),
            (
                f"Can you provide a list of sub-packages for the {package}?",
                f"There are no sub-packages in the {package}.",
            ),
            (
                f"Identify the sub-packages of {package}.",
                f"No sub-packages are present in the {package}.",
            ),
        ]
        package_retrieval_chunks.append(f"{package} does not have any further sub-packages.")
        package_tuning_pairs.extend(package_sub_package_pairs)
    else:
        children_sub_packages_count = len(children_sub_packages)
        children_sub_packages_count_pairs = [
            (
                f"How many sub-packages are there in {package}?",
                f"{package} has {children_sub_packages_count} many sub-packages.",
            ),
            (
                f"What is the count of sub-packages in {package}?",
                f"The count of sub-packages in {package} is {children_sub_packages_count}.",
            ),
            (
                f"Could you tell me the number of sub-packages available in {package}?",
                f"{package} has {children_sub_packages_count} sub-packages.",
            ),
            (
                f"Please provide the count of sub-packages for {package}.",
                f"Number of sub-packages in {package} is {children_sub_packages_count}.",
            ),
            (
                f"Tell me the quantity of sub-packages present in {package}.",
                f"{package} has {children_sub_packages_count} sub-packages.",
            ),
            (
                f"Would you mind letting me know how many sub-packages {package} contains?",
                f"{package} contains {children_sub_packages_count} sub-packages.",
            ),
        ]
        package_retrieval_chunks.append(
            f"{package} has {children_sub_packages_count} many sub-packages."
        )
        package_tuning_pairs.extend(children_sub_packages_count_pairs)

        package_sub_packages = enumerate_array_elements(children_sub_packages)
        package_sub_package_pairs = [
            (
                f"List the sub-packages of {package}.",
                f"Sub-packages of {package} are as follows: {package_sub_packages}.",
            ),
            (
                f"What are the sub-packages of the {package}?",
                f"The {package} has the following sub-packages: {package_sub_packages}.",
            ),
            (
                f"Could you tell me the sub-packages of {package}?",
                f"Sure, the sub-packages of {package} are: {package_sub_packages}.",
            ),
            (
                f"I need to know the sub-packages of {package}. Can you list them?",
                f"Of course, the sub-packages of {package} are: {package_sub_packages}.",
            ),
            (
                f"Please provide the sub-packages of {package}.",
                f"The sub-packages of {package} are: {package_sub_packages}.",
            ),
            (
                f"Can you enumerate the sub-packages of {package}?",
                f"Certainly, the sub-packages of {package} are: {package_sub_packages}.",
            ),
        ]
        package_retrieval_chunks.append(
            f"Sub-packages of {package} are as follows: {package_sub_packages}."
        )
        package_tuning_pairs.extend(package_sub_package_pairs)

    # modules: same pattern as sub-packages
    if not (children_modules := package_contents.children_modules_names):
        package_module_pairs = [
            (
                f"What are the modules of {package}?",
                f"{package} does not have any direct modules under itself.",
            ),
            (
                f"Can you list the modules under the {package}?",
                f"There are no direct modules under the {package}.",
            ),
            (
                f"Does the {package} contain any modules?",
                f"No, the {package} does not contain any direct modules.",
            ),
            (
                f"I'm looking for the modules of {package}. Can you help?",
                f"I'm sorry, but {package} does not have any direct modules.",
            ),
            (
                f"Tell me about the modules of {package}.",
                f"Actually, the {package} does not have any direct modules.",
            ),
            (
                f"Are there any modules under the {package}?",
                f"No, there aren't any direct modules under the {package}.",
            ),
        ]
        package_retrieval_chunks.append(f"{package} does not have any further modules.")
        package_tuning_pairs.extend(package_module_pairs)
    else:
        children_modules_count = len(children_modules)
        children_modules_count_pairs = [
            (
                f"How many modules are there in {package}?",
                f"{package} has {children_modules_count} many modules.",
            ),
            (
                f"What is the count of modules in {package}?",
                f"The count of modules in {package} is {children_modules_count}.",
            ),
            (
                f"Could you tell me the number of modules available in {package}?",
                f"{package} has {children_modules_count} modules.",
            ),
            (
                f"Please provide the count of modules for {package}.",
                f"The number of modules in {package} is {children_modules_count}.",
            ),
            (
                f"Tell me the quantity of modules present in {package}.",
                f"{package} has {children_modules_count} modules.",
            ),
            (
                f"Would you mind letting me know how many modules {package} contains?",
                f"{package} contains {children_modules_count} modules.",
            ),
        ]
        package_retrieval_chunks.append(f"{package} has {children_modules_count} many modules.")
        package_tuning_pairs.extend(children_modules_count_pairs)

        package_modules = enumerate_array_elements(children_modules)
        package_module_pairs = [
            (
                f"What are the modules of {package}?",
                f"Direct modules under {package} are as follows: {package_modules}.",
            ),
            (
                f"Can you list the modules of the {package}?",
                f"Sure, the direct modules under {package} are: {package_modules}.",
            ),
            (
                f"I need to know the modules of the {package}.",
                f"The modules you're looking for in {package} are: {package_modules}.",
            ),
            (
                f"Could you tell me what the modules of the {package} are?",
                f"Of course, the modules under {package} are: {package_modules}.",
            ),
            (
                f"I'm interested in the modules of the {package}.",
                f"The modules in {package} are: {package_modules}.",
            ),
            (
                f"What modules does the {package} contain?",
                f"The {package} contains these modules: {package_modules}.",
            ),
        ]
        package_retrieval_chunks.append(f"Modules of {package} are as follows: {package_modules}.")
        package_tuning_pairs.extend(package_module_pairs)

    # summary: fall back to "no documentation" answers when the docstring is empty
    if not (package_summary := package_contents.package_summary):
        package_summary_pairs = [
            (f"What does {package} do?", f"{package} does not have any documentation."),
            (
                f"Can you tell me the functionality of the {package}?",
                f"Unfortunately, the {package} provides no documentation.",
            ),
            (
                f"I'm curious about what the {package} does. Can you enlighten me?",
                f"I'm sorry, but the {package} does not come with any documentation.",
            ),
            (
                f"Could you explain the purpose of the {package}?",
                f"Regrettably, the {package} lacks any form of documentation.",
            ),
            (
                f"What's the role of the {package}?",
                f"The {package} does not offer any documentation.",
            ),
            (
                f"What functionality does the {package} provide?",
                f"The {package} does not have any available documentation.",
            ),
        ]
        package_retrieval_chunks.append(
            f"Unfortunately, {package} currently does not have any documentation."
        )
        package_tuning_pairs.extend(package_summary_pairs)
    else:
        package_summary_pairs = [
            (f"What does {package} do?", f"Its documentation is as follows: '{package_summary}'."),
            (
                f"Can you tell me about the {package}?",
                f"Sure, here is its documentation: '{package_summary}'.",
            ),
            (
                f"I'd like to know what the {package} does.",
                f"Of course, here's the documentation for it: '{package_summary}'.",
            ),
            (
                f"Could you explain the functionality of the {package}?",
                f"Absolutely, the documentation states: '{package_summary}'.",
            ),
            (
                f"What's the purpose of the {package}?",
                f"The purpose is described in its documentation: '{package_summary}'.",
            ),
            (
                f"I'm curious about the {package}, what does it do?",
                f"Good question, its documentation reads: '{package_summary}'.",
            ),
        ]
        package_retrieval_chunks.append(
            f"The following is the documentation of {package}: '{package_summary}'."
        )
        package_tuning_pairs.extend(package_summary_pairs)

    # public exports: report absence, or count + listing of __all__ members
    if not (package_exports := package_contents.package_all_exports):
        package_members_pairs = [
            (
                f"What are the public members of the {package}?",
                f"{package} does not have any public member exported through '__all__'.",
            ),
            (
                f"Can you list the public members of the {package}?",
                f"The {package} does not export any public members through '__all__'.",
            ),
            (
                f"Are there any public members in the {package}?",
                f"No, the {package} does not have any public members exported through '__all__'.",
            ),
            (
                f"I'm looking for public members of {package}. Can you help?",
                f"Sure, but the {package} does not export any public members through '__all__'.",
            ),
            (
                f"Could you tell me the public members of the {package}?",
                f"Unfortunately, the {package} does not have any public members"
                " exported through '__all__'.",
            ),
            (
                f"I'd like to know the public members of the {package}."
                " Can you provide that information?",
                f"I'm sorry, but the {package} does not have any public members"
                " exported through '__all__'.",
            ),
        ]
        package_retrieval_chunks.append(
            f"{package} does not export anything publicly using __all__ variable."
        )
        package_tuning_pairs.extend(package_members_pairs)
    else:
        package_exports_count = len(package_exports)
        package_exports_count_pairs = [
            (
                f"How many objects does {package} export publicly?",
                f"{package} exports {package_exports_count} many objects using __all__.",
            ),
            (
                f"What is the count of publicly exported objects in {package}?",
                f"Count of publicly exported objects in {package} is {package_exports_count}.",
            ),
            (
                f"Could you tell me the number of objects publicly exported by {package}?",
                f"{package} exports {package_exports_count} objects using __all__.",
            ),
            (
                f"Please provide the count of objects publicly exported by {package}.",
                f"Number of objects publicly exported by {package} is {package_exports_count}.",
            ),
            (
                f"Tell me the quantity of objects that {package} exports publicly.",
                f"{package} exports {package_exports_count} objects using __all__.",
            ),
            (
                f"Would you mind letting me know how many objects {package} publicly exports?",
                f"{package} publicly exports {package_exports_count} objects.",
            ),
        ]
        package_retrieval_chunks.append(
            f"{package} has {package_exports_count} many public exports."
        )
        package_tuning_pairs.extend(package_exports_count_pairs)

        package_public_members = enumerate_array_elements(package_exports)
        package_members_pairs = [
            (
                f"What are the public members of the {package}?",
                f"{package} publicly exports the following members using '__all__':"
                f" {package_public_members}.",
            ),
            (
                f"Can you list the public members of the {package}?",
                f"Sure, the {package} publicly exports these members using '__all__':"
                f" {package_public_members}.",
            ),
            (
                f"I need to know the public members of the {package}. Can you tell me?",
                f"Of course, the {package} publicly exports these members using '__all__':"
                f" {package_public_members}.",
            ),
            (
                f"Could you tell me what the {package} publicly exports?",
                f"The {package} publicly exports the following members using '__all__':"
                f" {package_public_members}.",
            ),
            (
                f"I'm interested in the public members of the {package}. What are they?",
                f"The {package} publicly exports these members using '__all__':"
                f" {package_public_members}.",
            ),
        ]
        package_retrieval_chunks.append(
            f"{package} exports following public members using __all__: {package_public_members}."
        )
        package_tuning_pairs.extend(package_members_pairs)

    package_dataset = Dataset(
        retrieval_chunks=package_retrieval_chunks, tuning_pairs=package_tuning_pairs
    )

    return package_dataset
+
+
+@pydantic.validate_call(validate_return=True)
+def generate_module_dataset(module_members: Module) -> Dataset:
+ module_name = module_members.module_name
+ module_full_name = module_members.module_qualified_name
+ module = f"'{module_name}' module"
+
+ module_retrieval_chunks: list[str] = [f"'{module_name}' is a Python module."]
+ module_tuning_pairs: list[tuple[str, str]] = []
+
+ module_package_pairs = [
+ (
+ f"Can you tell the the parent package of {module}?",
+ f"'{module_members.package_name}' is the parent package of {module}.",
+ ),
+ (
+ f"What is the parent package of the {module}?",
+ f"The parent package of {module} is '{module_members.package_name}'.",
+ ),
+ (
+ f"I'm trying to find the parent package of the {module}. Can you help?",
+ f"Sure, parent package of {module} is '{module_members.package_name}'.",
+ ),
+ (
+ f"Could you inform me about the parent package of the {module}?",
+ f"Certainly, '{module_members.package_name}' is the parent package of the {module}.",
+ ),
+ (
+ f"I need to know the parent package of {module}. Can you provide that information?",
+ f"Absolutely, the parent package of the {module} is '{module_members.package_name}'.",
+ ),
+ (
+ f"Can you identify the parent package for the {module}?",
+ f"Yes, parent package for {module} is '{module_members.package_name}'.",
+ ),
+ ]
+ module_retrieval_chunks.append(
+ f"{module} is part of parent package '{module_members.package_name}'."
+ )
+ module_tuning_pairs.extend(module_package_pairs)
+
+ module_full_name_pairs = [
+ (
+ f"Specify the full name of {module}?",
+ f"'{module_full_name}' is fully qualified name for {module}.",
+ ),
+ (
+ f"What is the fully qualified name for the {module}?",
+ f"The fully qualified name for the {module} is '{module_full_name}'.",
+ ),
+ (
+ f"Could you tell me the full name of the {module}?",
+ f"Sure, the full name of the {module} is '{module_full_name}'.",
+ ),
+ (
+ f"I need the full name of the {module}. Can you provide it?",
+ f"Of course, the full name of the {module} is '{module_full_name}'.",
+ ),
+ (
+ f"Can you specify the fully qualified name of the {module}?",
+ f"Yes, fully qualified name of the {module} is '{module_full_name}'.",
+ ),
+ (
+ f"I'm looking for the full name of the {module}. What is it?",
+ f"Full name of the {module} you're looking for is '{module_full_name}'.",
+ ),
+ ]
+ module_retrieval_chunks.append(f"Full name of {module} is '{module_full_name}'.")
+ module_tuning_pairs.extend(module_full_name_pairs)
+
+ module_hierarchy = enumerate_array_elements(module_members.module_hierarchy)
+ module_hierarchy_pairs = [
+ (
+ f"What is the hierarchy of {module}?",
+ f"The hierarchy of {module} is as follows: {module_hierarchy}.",
+ ),
+ (
+ f"Can you explain the hierarchy of the {module}?",
+ f"Sure, the hierarchy of the {module} is: {module_hierarchy}.",
+ ),
+ (
+ f"Could you describe the structure of the {module}?",
+ f"Of course, the structure of the {module} is: {module_hierarchy}.",
+ ),
+ (
+ f"I need to understand the hierarchy of the {module}. Can you help?",
+ f"Absolutely, the hierarchy of the {module} is: {module_hierarchy}.",
+ ),
+ (
+ f"Please provide the hierarchy of the {module}.",
+ f"The hierarchy of the {module} is: {module_hierarchy}.",
+ ),
+ (
+ f"What does the hierarchy of the {module} look like?",
+ f"The hierarchy of the {module} looks like this: {module_hierarchy}.",
+ ),
+ ]
+ module_retrieval_chunks.append(f"Hierarchy of {module} is as follows: {module_hierarchy}.")
+ module_tuning_pairs.extend(module_hierarchy_pairs)
+
+ module_members_count = len(module_members.module_members)
+ module_members_count_pairs = [
+ (
+ f"How many members does {module} have?",
+ f"{module} has {module_members_count} many members.",
+ ),
+ (
+ f"What is the count of members in {module}?",
+ f"The count of members in {module} is {module_members_count}.",
+ ),
+ (
+ f"Could you tell me the number of members in {module}?",
+ f"{module} has {module_members_count} members.",
+ ),
+ (
+ f"Please provide the count of members for {module}.",
+ f"The number of members in {module} is {module_members_count}.",
+ ),
+ (
+ f"Tell me the quantity of members present in {module}.",
+ f"{module} has {module_members_count} members.",
+ ),
+ (
+ f"Would you mind letting me know how many members {module} contains?",
+ f"{module} contains {module_members_count} members.",
+ ),
+ ]
+ module_retrieval_chunks.append(f"{module} has {module_members_count} many members.")
+ module_tuning_pairs.extend(module_members_count_pairs)
+
+ module_member_names = enumerate_array_elements(
+ module_members.module_members, attribute="member_name"
+ )
+ module_members_pairs = [
+ (
+ f"List the members of {module}.",
+ f"Members of {module} are as follows: {module_member_names}.",
+ ),
+ (
+ f"What are the members of the {module}?",
+ f"The {module} has the following members: {module_member_names}.",
+ ),
+ (
+ f"Can you tell me the members of the {module}?",
+ f"Sure, the members of the {module} are: {module_member_names}.",
+ ),
+ (
+ f"I need to know the members of the {module}.",
+ f"Members of {module} you asked for are: {module_member_names}.",
+ ),
+ (
+ f"Could you list the members of the {module}?",
+ f"Of course, members of the {module} are: {module_member_names}.",
+ ),
+ (
+ f"Please provide the members of the {module}.",
+ f"Members of {module} you requested are: {module_member_names}.",
+ ),
+ ]
+ module_retrieval_chunks.append(f"Members of {module} are as follows: {module_member_names}.")
+ module_tuning_pairs.extend(module_members_pairs)
+
+ if not (module_summary := module_members.module_summary):
+ module_summary_pairs = [
+ (f"What is the {module} for?", f"{module} does not have any documentation."),
+ (
+ f"Can you tell me the purpose of the {module}?",
+ f"The {module} lacks any documentation.",
+ ),
+ (
+ f"I'd like to know what the {module} is used for.",
+ f"Unfortunately, there is no documentation for the {module}.",
+ ),
+ (
+ f"Could you explain the function of the {module}?",
+ f"Regrettably, the {module} doesn't come with any documentation.",
+ ),
+ (f"What does the {module} do?", f"The {module} is without any documentation."),
+ ]
+ module_retrieval_chunks.append(
+ f"Unfortunately, {module} currently does not have any documentation."
+ )
+ module_tuning_pairs.extend(module_summary_pairs)
+ else:
+ module_summary_pairs = [
+ (
+ f"What is the '{module_name}' module for?",
+ f"{module} documents itself as follows: '{module_summary}'.",
+ ),
+ (
+ f"Can you tell me the purpose of the '{module_name}' module?",
+ f"Purpose of {module} is documented as: '{module_summary}'.",
+ ),
+ (
+ f"I'm curious about the '{module_name}' module. What does it do?",
+ f"The {module} is described as: '{module_summary}'.",
+ ),
+ (
+ f"Could you explain the functionality of the '{module_name}' module?",
+ f"The functionality of the {module} is described as: '{module_summary}'.",
+ ),
+ (
+ f"I'd like to know more about the '{module_name}' module. What's its role?",
+ f"The role of the {module} is: '{module_summary}'.",
+ ),
+ (
+ f"What's the use of the '{module_name}' module?",
+ f"Use of the {module} is documented as: '{module_summary}'.",
+ ),
+ ]
+ module_retrieval_chunks.append(
+ f"The following is the documentation of {module}: {module_summary}."
+ )
+ module_tuning_pairs.extend(module_summary_pairs)
+
+ if not (module_exports := module_members.module_all_exports):
+ module_exports_pairs = [
+ (
+ f"Tell me the public members of the {module}.",
+ f"{module} lacks any public member exported through '__all__'.",
+ ),
+ (
+ f"What are the public members of the {module}?",
+ "There are no public members exported through '__all__' in the {module}.",
+ ),
+ (
+ f"Could you list the public members of the {module}?",
+ f"Unfortunately, {module} does not export any public members through '__all__'.",
+ ),
+ (
+ f"I need to know the public members of the {module}.",
+ f"The {module} does not have any public members exported through '__all__'.",
+ ),
+ (
+ f"Can you show me the public members of the {module}?",
+ f"The {module} does not contain any public members exported through '__all__'.",
+ ),
+ (
+ f"I'm interested in the public members of the {module}. What are they?",
+ f"{module} does not export any public members through '__all__'.",
+ ),
+ ]
+ module_retrieval_chunks.append(
+ f"{module} does not export anything publicly using __all__ variable."
+ )
+ module_tuning_pairs.extend(module_exports_pairs)
+ else:
+ module_exports_count = len(module_exports)
+ module_exports_count_pairs = [
+ (
+ f"How many objects does {module} export publicly?",
+ f"{module} exports {module_exports_count} many objects using __all__.",
+ ),
+ (
+ f"What is the count of publicly exported objects in {module}?",
+ f"The count of publicly exported objects in {module} is {module_exports_count}.",
+ ),
+ (
+ f"Could you tell me the number of objects publicly exported by {module}?",
+ f"{module} exports {module_exports_count} objects using __all__.",
+ ),
+ (
+ f"Please provide the count of objects publicly exported by {module}.",
+ f"The number of objects publicly exported by {module} is {module_exports_count}.",
+ ),
+ (
+ f"Tell me the quantity of objects that {module} exports publicly.",
+ f"{module} exports {module_exports_count} objects using __all__.",
+ ),
+ (
+ f"Would you mind letting me know how many objects {module} publicly exports?",
+ f"{module} publicly exports {module_exports_count} objects.",
+ ),
+ ]
+ module_retrieval_chunks.append(f"{module} has {module_exports_count} many public exports.")
+ module_tuning_pairs.extend(module_exports_count_pairs)
+
+ module_public_exports = enumerate_array_elements(module_exports)
+ module_exports_pairs = [
+ (
+ f"Tell me the public members of the {module}.",
+ f"{module} publicly exports the following members using '__all__':"
+ f" {module_public_exports}.",
+ ),
+ (
+ f"What are the public members of the {module}?",
+ f"The {module} publicly exports the following members using '__all__':"
+ f" {module_public_exports}.",
+ ),
+ (
+ f"Could you list the public members of the {module}?",
+ f"Sure, the {module} publicly exports these members using '__all__':"
+ f" {module_public_exports}.",
+ ),
+ (
+ f"I need to know the public members of the {module}.",
+ f"The {module} publicly exports these members using '__all__':"
+ f" {module_public_exports}.",
+ ),
+ (
+ f"Can you show me the public members of the {module}?",
+ f"Of course, the {module} publicly exports the following members using '__all__':"
+ f" {module_public_exports}.",
+ ),
+ ]
+ module_retrieval_chunks.append(
+ f"{module} exports following members using __all__: {module_public_exports}."
+ )
+ module_tuning_pairs.extend(module_exports_pairs)
+
+ module_dataset = Dataset(
+ retrieval_chunks=module_retrieval_chunks, tuning_pairs=module_tuning_pairs
+ )
+
+ return module_dataset
+
+
+@pydantic.validate_call(validate_return=True)
+def generate_enum_member_dataset(
+ enum_member: str, enum_docstring: str, member_type_details: EnumDetails
+) -> tuple[Dataset, list[str]]:
+ enum_member_retrieval_chunks: list[str] = [
+ f"{enum_member} is a Python enum.",
+ f"{enum_member} has following docstring: {enum_docstring}.",
+ ]
+ enum_member_tuning_pairs: list[tuple[str, str]] = []
+
+ enum_member_count = len(member_type_details.enum_members)
+ enum_member_count_pairs = [
+ (
+ f"How many members are there in {enum_member}?",
+ f"{enum_member} has {enum_member_count} members.",
+ ),
+ (
+ f"What is the count of members in {enum_member}?",
+ f"The count of members in {enum_member} is {enum_member_count}.",
+ ),
+ (
+ f"Can you tell me the number of members in {enum_member}?",
+ f"Sure, the number of members in {enum_member} is {enum_member_count}.",
+ ),
+ (
+ f"Could you provide the total number of members in {enum_member}?",
+ f"The total number of members in {enum_member} is {enum_member_count}.",
+ ),
+ (
+ f"I need to know the quantity of members in {enum_member}.",
+ f"The quantity of members in {enum_member} is {enum_member_count}.",
+ ),
+ (
+ f"Please inform me about the number of members in {enum_member}.",
+ f"The number of members in {enum_member} is {enum_member_count}.",
+ ),
+ ]
+ enum_member_retrieval_chunks.insert(-1, f"{enum_member} has {enum_member_count} many members.")
+ enum_member_tuning_pairs.extend(enum_member_count_pairs)
+
+ enum_members = enumerate_array_elements(
+ member_type_details.enum_members, attribute="enum_member"
+ )
+ enum_members_pairs = [
+ (
+ f"What are the different members of {enum_member}?",
+ f"Different members of {enum_member} are as follows: {enum_members}.",
+ ),
+ (
+ f"Can you list the different members of {enum_member}?",
+ f"Sure, the different members of {enum_member} are: {enum_members}.",
+ ),
+ (
+ f"Could you tell me the different members of {enum_member}?",
+ f"Of course, the different members of {enum_member} include: {enum_members}.",
+ ),
+ (
+ f"I need to know the different members of {enum_member}.",
+ f"The different members of {enum_member} are: {enum_members}.",
+ ),
+ (
+ f"What does {enum_member} consist of?",
+ f"{enum_member} consists of the following members: {enum_members}.",
+ ),
+ ]
+ enum_member_retrieval_chunks.insert(
+ -1, f"Members of {enum_member} are as follows: {enum_members}."
+ )
+ enum_member_tuning_pairs.extend(enum_members_pairs)
+
+ enum_member_names = enumerate_array_elements(
+ member_type_details.enum_members, attribute="enum_member_name"
+ )
+ enum_member_names_pairs = [
+ (
+ f"List just the names of different members of {enum_member}.",
+ f"Different members of {enum_member} have the following names: {enum_member_names}.",
+ ),
+ (
+ f"Can you provide the names of different members of {enum_member}?",
+ f"Sure, different members of {enum_member} are named as follows: {enum_member_names}.",
+ ),
+ (
+ f"What are the names of different members of {enum_member}?",
+ f"The names of different members of {enum_member} are: {enum_member_names}.",
+ ),
+ (
+ f"I need the names of different members of {enum_member}.",
+ f"The different members of {enum_member} have these names: {enum_member_names}.",
+ ),
+ (
+ f"Could you list the names of different members of {enum_member}?",
+ f"Of course, different members of {enum_member} have these names:"
+ f" {enum_member_names}.",
+ ),
+ (
+ f"Show me the names of different members of {enum_member}.",
+ f"The names of different members of {enum_member} are: {enum_member_names}.",
+ ),
+ ]
+ enum_member_retrieval_chunks.insert(
+ -1, f"Names of different members of {enum_member} are as follows: {enum_member_names}."
+ )
+ enum_member_tuning_pairs.extend(enum_member_names_pairs)
+
+ enum_member_values = enumerate_array_elements(
+ member_type_details.enum_members, attribute="enum_member_value"
+ )
+ enum_member_values_pairs = [
+ (
+ f"Only show the different values supported by {enum_member}.",
+ f"{enum_member} supports the following values: {enum_member_values}.",
+ ),
+ (
+ f"What are the different values that {enum_member} supports?",
+ f"The different values that {enum_member} supports are: {enum_member_values}.",
+ ),
+ (
+ f"Can you list the values supported by {enum_member}?",
+ f"Sure, {enum_member} supports these values: {enum_member_values}.",
+ ),
+ (
+ f"I need to know the values supported by {enum_member}.",
+ f"{enum_member} supports these values: {enum_member_values}.",
+ ),
+ (
+ f"Could you tell me the values that {enum_member} supports?",
+ f"Of course, the values that {enum_member} supports are: {enum_member_values}.",
+ ),
+ (
+ f"Please provide the values supported by {enum_member}.",
+ f"The values supported by {enum_member} are: {enum_member_values}.",
+ ),
+ ]
+ enum_member_retrieval_chunks.insert(
+ -1, f"Values of different members of {enum_member} are as follows: {enum_member_values}."
+ )
+ enum_member_tuning_pairs.extend(enum_member_values_pairs)
+
+ enum_member_dataset = Dataset(
+ retrieval_chunks=enum_member_retrieval_chunks, tuning_pairs=enum_member_tuning_pairs
+ )
+
+ return enum_member_dataset, enum_member_retrieval_chunks
+
+
+@pydantic.validate_call(validate_return=True)
+def generate_class_member_dataset( # noqa: C901, PLR0912, PLR0915
+ class_member: str, class_docstring: str, member_type_details: ClassDetails
+) -> tuple[Dataset, list[str]]:
+ class_member_retrieval_chunks: list[str] = [
+ f"{class_member} is a Python class.",
+ f"{class_member} has following docstring: {class_docstring}.",
+ ]
+ class_member_tuning_pairs: list[tuple[str, str]] = []
+
+ if not (class_parameters := member_type_details.class_parameters):
+ class_parameters_pairs = [
+ (
+ f"What are the different parameters of {class_member}?",
+ f"{class_member} needs no arguments for instantiation.",
+ ),
+ (
+ f"Can you tell me the parameters required for {class_member}?",
+ f"No parameters are required for instantiating {class_member}.",
+ ),
+ (
+ f"What arguments do I need to instantiate {class_member}?",
+ f"You don't need any arguments to instantiate {class_member}.",
+ ),
+ (
+ f"Do I need any parameters to use {class_member}?",
+ f"{class_member} can be used without any parameters.",
+ ),
+ (
+ f"What should I pass as arguments when creating an instance of {class_member}?",
+ "There's no need to pass any arguments"
+ f" when creating an instance of {class_member}.",
+ ),
+ (
+ f"Are there any parameters needed for the instantiation of {class_member}?",
+ f"The instantiation of {class_member} doesn't require any parameters.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{class_member} requires no arguments for instantiation."
+ )
+ class_member_tuning_pairs.extend(class_parameters_pairs)
+ else:
+ class_parameter_names = enumerate_array_elements(
+ class_parameters, attribute="parameter_details"
+ )
+ class_parameters_pairs = [
+ (
+ f"What are the different parameters of {class_member}?",
+ f"{class_member} supports these arguments to initiate"
+ f" a new instance: {class_parameter_names}.",
+ ),
+ (
+ f"Can you list the parameters for {class_member}?",
+ f"Sure, {class_member} can be initiated with these arguments:"
+ f" {class_parameter_names}.",
+ ),
+ (
+ f"I need to know the parameters of {class_member}.",
+ f"The parameters to initiate a new instance of {class_member} are:"
+ f" {class_parameter_names}.",
+ ),
+ (
+ f"Tell me the parameters that {class_member} supports.",
+ f"{class_member} can be initiated with these arguments: {class_parameter_names}.",
+ ),
+ (
+ f"What arguments does {class_member} take for initialisation?",
+ f"To initialise {class_member}, you can use these arguments:"
+ f" {class_parameter_names}.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{class_member} requires the following arguments for initialisation:"
+ f" {class_parameter_names}"
+ )
+ class_member_tuning_pairs.extend(class_parameters_pairs)
+
+ for class_parameter in class_parameters:
+ parameter_name = class_parameter.parameter_name
+ parameter = f"'{parameter_name}' argument in {class_member}"
+
+ if (parameter_default := class_parameter.parameter_default) is inspect._empty:
+ class_parameter_defaults_pairs = [
+ (
+ f"Tell default value of {parameter}.",
+ f"{parameter} does not have a default value.",
+ ),
+ (
+ f"What is the default value of {parameter}?",
+ f"The {parameter} does not have a default value.",
+ ),
+ (
+ f"Could you inform me about default value of {parameter}?",
+ f"Sure, the {parameter} does not have a default value.",
+ ),
+ (
+ f"I need to know the default value of {parameter}. Can you help?",
+ f"Of course, the {parameter} does not have a default value.",
+ ),
+ (
+ f"Can you tell me if {parameter} has default value?",
+ f"No, the {parameter} does not have a default value.",
+ ),
+ (
+ f"I'm curious about default value of {parameter}.",
+ f"Well, the {parameter} does not have a default value.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(f"{parameter} does not have a default value.")
+ class_member_tuning_pairs.extend(class_parameter_defaults_pairs)
+ else:
+ class_parameter_defaults_pairs = [
+ (
+ f"Tell default value of {parameter}.",
+ f"{parameter} takes {parameter_default} by default.",
+ ),
+ (
+ f"What is the default value of {parameter}?",
+ f"The default value of {parameter} is {parameter_default}.",
+ ),
+ (
+ f"Could you inform me about default value of {parameter}?",
+ f"Sure, the default value of {parameter} is {parameter_default}.",
+ ),
+ (
+ f"I need to know the default value of {parameter}.",
+ f"The default value of {parameter} is {parameter_default}.",
+ ),
+ (
+ f"Can you provide default value of {parameter}?",
+ f"Yes, default value of {parameter} is {parameter_default}.",
+ ),
+ (
+ f"Please, disclose default value of {parameter}.",
+ f"Certainly, the default value of {parameter} is {parameter_default}.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{parameter_default} is the default value of {parameter}."
+ )
+ class_member_tuning_pairs.extend(class_parameter_defaults_pairs)
+
+ if (parameter_annotation := class_parameter.parameter_annotation) is inspect._empty:
+ class_parameter_types_pairs = [
+ (
+ f"Name type hint for {parameter}.",
+ f"{parameter} does not have a type annotation.",
+ ),
+ (
+ f"What is the type hint for {parameter}?",
+ f"There is no type annotation for the {parameter}.",
+ ),
+ (
+ f"Can you tell me the type hint for {parameter}?",
+ f"The {parameter} is not annotated with a type.",
+ ),
+ (
+ f"I'm looking for the type hint for {parameter}. Can you help?",
+ f"Sure, the {parameter} does not have a type annotation.",
+ ),
+ (
+ f"Could you provide the type hint for {parameter}?",
+ f"Unfortunately, {parameter} does not have type annotation.",
+ ),
+ (
+ f"I need to know the type hint for {parameter}.",
+ f"The {parameter} does not come with a type annotation.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(f"Type hint for {parameter} is unavailable.")
+ class_member_tuning_pairs.extend(class_parameter_types_pairs)
+ else:
+ class_parameter_types_pairs = [
+ (
+ f"Name type hint for {parameter}.",
+ f"{parameter} has '{parameter_annotation}' as type hint.",
+ ),
+ (
+ f"What is the type hint for {parameter}?",
+ f"The type hint for {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"Could you tell me the type hint for {parameter}?",
+ f"Sure, the type hint for {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"I need to know the type hint for {parameter}.",
+ f"The type hint for {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"Identify the type hint for {parameter}.",
+ f"The type hint for {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"Can you specify the type hint for {parameter}?",
+ f"Yes, the type hint for {parameter} is '{parameter_annotation}'.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{parameter} is annotated as '{parameter_annotation}' type."
+ )
+ class_member_tuning_pairs.extend(class_parameter_types_pairs)
+
+ if not (parameter_summary := class_parameter.parameter_summary):
+ class_parameter_summary_pairs = [
+ (
+ f"What does {parameter} do?",
+ f"Docstring of {class_member} does not describe '{parameter_name}'.",
+ ),
+ (
+ f"Can you explain the role of {parameter}?",
+ f"The docstring of {class_member} does not provide any information about"
+ f" '{parameter_name}'.",
+ ),
+ (
+ f"I'm trying to understand what {parameter} does. Can you help?",
+ f"Unfortunately, the docstring of {class_member} does not mention anything"
+ f" about '{parameter_name}'.",
+ ),
+ (
+ f"What is the function of {parameter}?",
+ f"There is no description of '{parameter_name}' in the docstring of"
+ f" {class_member}.",
+ ),
+ (
+ f"Could you tell me what '{parameter_name}' does in {class_member}?",
+ f"The docstring of {class_member} does not contain any details about"
+ f" '{parameter_name}'.",
+ ),
+ (
+ f"I'm curious about the purpose of {parameter}. Can you enlighten me?",
+ f"I'm sorry, but the docstring of {class_member} does not discuss"
+ f" '{parameter_name}'.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{parameter} lacks any documentation in the docstring."
+ )
+ class_member_tuning_pairs.extend(class_parameter_summary_pairs)
+ else:
+ class_parameter_summary_pairs = [
+ (
+ f"What does {parameter} do?",
+ f"{class_member} documents role of '{parameter_name}' as follows:"
+ f" '{parameter_summary}'.",
+ ),
+ (
+ f"Can you explain the role of {parameter}?",
+ f"Sure, {class_member} describes '{parameter_name}' as follows:"
+ f" '{parameter_summary}'.",
+ ),
+ (
+ f"I'm curious about {parameter}. What does it do?",
+ f"In {class_member}, '{parameter_name}' is documented as follows:"
+ f" '{parameter_summary}'.",
+ ),
+ (
+ f"Could you tell me what {parameter} does?",
+ f"Of course, {parameter} is described as follows: '{parameter_summary}'.",
+ ),
+ (
+ f"What's the function of {parameter}?",
+ f"{class_member} describes the function of '{parameter_name}' as follows:"
+ f" '{parameter_summary}'.",
+ ),
+ (
+ f"I'd like to know the purpose of {parameter}.",
+ f"In {class_member}, the purpose of '{parameter_name}' is defined as follows:"
+ f" '{parameter_summary}'.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"As per docstring, role of {parameter} is: '{parameter_summary}'."
+ )
+ class_member_tuning_pairs.extend(class_parameter_summary_pairs)
+
+ if not (class_methods := member_type_details.class_methods):
+ class_method_names_pairs = [
+ (
+ f"List names of the public methods of {class_member}.",
+ f"{class_member} does not have any public methods (not starting with '_').",
+ ),
+ (
+ f"Can you provide the names of the public methods for {class_member}?",
+ f"Unfortunately, {class_member} does not have any public methods.",
+ ),
+ (
+ f"What are the public methods of {class_member}?",
+ f"There are no public methods (not starting with '_') in {class_member}.",
+ ),
+ (
+ f"I need to know the public methods of {class_member}. Can you list them?",
+ f"I'm sorry, but {class_member} does not have any public methods.",
+ ),
+ (
+ f"Could you list the public methods of {class_member}?",
+ f"{class_member} does not contain any public methods (not starting with '_').",
+ ),
+ (
+ f"Show me the public methods of {class_member}.",
+ f"It appears that {class_member} does not have any public methods.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{class_member} has no public (without _ as the prefix) methods."
+ )
+ class_member_tuning_pairs.extend(class_method_names_pairs)
+ else:
+ class_methods_count = len(class_methods)
+ class_methods_count_pairs = [
+ (
+ f"How many public methods does {class_member} have?",
+ f"{class_member} has {class_methods_count} many public methods.",
+ ),
+ (
+ f"What is the count of public methods in {class_member}?",
+ f"The count of public methods in {class_member} is {class_methods_count}.",
+ ),
+ (
+ f"Could you tell me the number of public methods in {class_member}?",
+ f"{class_member} has {class_methods_count} public methods.",
+ ),
+ (
+ f"Please provide the count of public methods for {class_member}.",
+ f"The number of public methods in {class_member} is {class_methods_count}.",
+ ),
+ (
+ f"Tell me the quantity of public methods present in {class_member}.",
+ f"{class_member} has {class_methods_count} public methods.",
+ ),
+ (
+ f"Would you mind letting me know how many public methods {class_member} contains?",
+ f"{class_member} contains {class_methods_count} public methods.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{class_member} has {class_methods_count} many public methods."
+ )
+ class_member_tuning_pairs.extend(class_methods_count_pairs)
+
+ class_public_methods = enumerate_array_elements(class_methods, attribute="method_name")
+ class_method_names_pairs = [
+ (
+ f"List names of the public methods of {class_member}.",
+ f"Here are the public methods of {class_member}: {class_public_methods}.",
+ ),
+ (
+ f"Can you provide the names of the public methods for {class_member}?",
+ f"Sure, the public methods of {class_member} that do not start with '_' are:"
+ f" {class_public_methods}.",
+ ),
+ (
+ f"What are the public methods of {class_member}?",
+ f"The public methods of {class_member} (excluding those starting with '_') are:"
+ f" {class_public_methods}.",
+ ),
+ (
+ f"I need to know the public methods of {class_member}.",
+ f"The public methods of {class_member} (those not starting with '_') are:"
+ f" {class_public_methods}.",
+ ),
+ (
+ f"Could you list the public methods of {class_member}?",
+ f"Of course, the public methods of {class_member} (not beginning with '_') are:"
+ f" {class_public_methods}.",
+ ),
+ (
+ f"Please show me the public methods of {class_member}.",
+ f"Here you go, the public methods of {class_member}"
+ f" (excluding those with a prefix '_') are: {class_public_methods}.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{class_member} has the following public methods: {class_public_methods}"
+ )
+ class_member_tuning_pairs.extend(class_method_names_pairs)
+
+ for class_method in class_methods:
+ method_name = class_method.method_name
+ method = f"'{method_name}' method of {class_member}"
+
+ if not (method_parameters := class_method.method_parameters):
+ class_method_parameters_pairs = [
+ (f"What arguments do {method} accept?", f"{method} does not take any parameters."),
+ (
+ f"Can you tell me the parameters that {method} requires?",
+ f"The {method} does not require any parameters.",
+ ),
+ (
+ f"What are the inputs for the {method} in {class_member}?",
+ f"There are no inputs for the {method} in {class_member}.",
+ ),
+ (
+ f"Does the {method} need any arguments?",
+ f"No, {method} does not need any arguments.",
+ ),
+ (
+ f"What parameters should I pass to {method}?",
+ f"You don't need to pass any parameters to the {method}.",
+ ),
+ (
+ f"What are required arguments for {method}?",
+ f"{method} does not require any arguments.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(f"{method} takes no arguments.")
+ class_member_tuning_pairs.extend(class_method_parameters_pairs)
+ else:
+ class_method_parameters = enumerate_array_elements(method_parameters)
+ class_method_parameters_pairs = [
+ (
+ f"What arguments do {method} accept?",
+ f"{method} takes the following parameters: {class_method_parameters}.",
+ ),
+ (
+ f"Can you tell me the parameters that {method} requires?",
+ f"Sure, {method} requires these parameters: {class_method_parameters}.",
+ ),
+ (
+ f"I need to know arguments for {method}.",
+ f"The {method} has these arguments: {class_method_parameters}.",
+ ),
+ (
+ f"What are the parameters for '{method}'?",
+ f"The parameters for {method} are: {class_method_parameters}.",
+ ),
+ (
+ f"Could you list the arguments that the {method} takes?",
+ f"Certainly, the {method} takes these arguments: {class_method_parameters}.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{method} accepts following parameters: {class_method_parameters}"
+ )
+ class_member_tuning_pairs.extend(class_method_parameters_pairs)
+
+ if not (method_summary := class_method.method_summary):
+ class_method_summary_pairs = [
+ (f"What does {method} do?", f"Docstring of {method} is missing."),
+ (
+ f"Can you explain functionality of {method}?",
+ f"The docstring for {method} is not available.",
+ ),
+ (
+ f"I'm trying to understand what {method} does. Can you help?",
+ f"Unfortunately, the docstring for {method} is not provided.",
+ ),
+ (
+ f"Could you describe the role of {method}?",
+ f"There is no docstring available for {method}.",
+ ),
+ (
+ f"I'm not sure what {method} does. Can you clarify?",
+ f"The {method} lacks a docstring.",
+ ),
+ (f"What's the purpose of {method}?", f"The {method} doesn't have a docstring."),
+ ]
+ class_member_retrieval_chunks.append(f"Unfortunately, {method} is not documented.")
+ class_member_tuning_pairs.extend(class_method_summary_pairs)
+ else:
+ class_method_summary_pairs = [
+ (
+ f"What does {method} do?",
+ f"Based on method docstring, its role is to '{method_summary}'.",
+ ),
+ (
+ f"Can you explain the function of {method}?",
+ f"Sure, according to method docstring, it is designed to '{method_summary}'.",
+ ),
+ (
+ f"I'm curious about the {method}. What's its purpose?",
+ f"Well, if we look at the docstring of {method}, we can see that it's meant to"
+ f" '{method_summary}'.",
+ ),
+ (
+ f"Could you tell me what the {method} does?",
+ f"Of course, the docstring of {method} indicates that its function is to"
+ f" '{method_summary}'.",
+ ),
+ (
+ f"I'd like to understand role of {method}.",
+ f"Certainly, method docstring reveals that its job is to '{method_summary}'.",
+ ),
+ (
+ f"What's the functionality of the {method}?",
+ f"As per the method docstring, it's designed to '{method_summary}'.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"Based on docstring, {method} has the purpose of '{method_summary}'."
+ )
+ class_member_tuning_pairs.extend(class_method_summary_pairs)
+
+ if not (class_attributes := member_type_details.class_attributes):
+ class_attribute_names_pairs = [
+ (
+ f"Are there any public attributes of {class_member}?",
+ f"{class_member} has no public attributes (not starting with '_').",
+ ),
+ (
+ f"Does {class_member} have any public attributes?",
+ f"No, {class_member} does not have any public attributes.",
+ ),
+ (
+ f"Can you tell me if {class_member} has any public attributes?",
+ f"{class_member} does not have any public attributes (not starting with '_').",
+ ),
+ (
+ f"I'm looking for public attributes of {class_member}. Are there any?",
+ f"There are no public attributes (not starting with '_') for {class_member}.",
+ ),
+ (
+ f"Is it possible to find any public attributes in {class_member}?",
+ f"It's not possible to find any public attributes in {class_member}.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(f"{class_member} has no public attributes.")
+ class_member_tuning_pairs.extend(class_attribute_names_pairs)
+ else:
+ class_attributes_count = len(class_attributes)
+ class_attributes_count_pairs = [
+ (
+ f"How many public attributes does {class_member} have?",
+ f"{class_member} has {class_attributes_count} many public attributes.",
+ ),
+ (
+ f"What is the count of public attributes in {class_member}?",
+ f"The count of public attributes in {class_member} is {class_attributes_count}.",
+ ),
+ (
+ f"Could you tell me the number of public attributes in {class_member}?",
+ f"{class_member} has {class_attributes_count} public attributes.",
+ ),
+ (
+ f"Please provide the count of public attributes for {class_member}.",
+ f"Number of public attributes in {class_member} is {class_attributes_count}.",
+ ),
+ (
+ f"Tell me the quantity of public attributes present in {class_member}.",
+ f"{class_member} has {class_attributes_count} public attributes.",
+ ),
+ (
+ f"Would you mind letting me know how many public attributes {class_member}"
+ " contains?",
+ f"{class_member} contains {class_attributes_count} public attributes.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{class_member} has {class_attributes_count} many public attributes."
+ )
+ class_member_tuning_pairs.extend(class_attributes_count_pairs)
+
+ class_public_attributes = enumerate_array_elements(
+ class_attributes, attribute="attribute_name"
+ )
+ class_attribute_names_pairs = [
+ (
+ f"Are there any public attributes of {class_member}?",
+ f"These are the public attributes of {class_member}: {class_public_attributes}.",
+ ),
+ (
+ f"Can you list the public attributes of {class_member}?",
+ f"{class_member} has the following public attributes (not starting with '_'):"
+ f" {class_public_attributes}.",
+ ),
+ (
+ f"What are the public attributes of {class_member}?",
+ f"The public attributes of {class_member} (those not starting with '_') are:"
+ f" {class_public_attributes}.",
+ ),
+ (
+ f"I need to know the public attributes of {class_member}.",
+ f"Sure, the public attributes of {class_member} are: {class_public_attributes}.",
+ ),
+ (
+ f"Could you tell me the public attributes of {class_member}?",
+ f"Of course, public attributes of {class_member} (not starting with '_') are:"
+ f" {class_public_attributes}.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{class_member} has following public attributes: {class_public_attributes}"
+ )
+ class_member_tuning_pairs.extend(class_attribute_names_pairs)
+
+ if not (class_summary := member_type_details.class_summary):
+ class_summary_pairs = [
+ (
+ f"What does {class_member} do in short?",
+ f"Docstring of {class_member} lacks a summary of its objective.",
+ ),
+ (
+ f"Can you briefly explain the function of {class_member}?",
+ f"Docstring of {class_member} doesn't provide a concise summary of its purpose.",
+ ),
+ (
+ f"Could you tell me what {class_member} is used for?",
+ f"Unfortunately, the docstring of {class_member} doesn't contain"
+ " a brief description of its function.",
+ ),
+ (
+ f"I'm not sure what {class_member} does. Can you clarify?",
+ f"The docstring of {class_member} doesn't succinctly explain its role.",
+ ),
+ (
+ f"What's the purpose of {class_member}?",
+ f"Docstring of {class_member} doesn't have any explanation of its objective.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"Unfortunately, {class_member} does not document its objective."
+ )
+ class_member_tuning_pairs.extend(class_summary_pairs)
+ else:
+ class_summary_pairs = [
+ (
+ f"What does {class_member} do in short?",
+ f"Based on documentation, objective of {class_member} is to: '{class_summary}'.",
+ ),
+ (
+ f"Can you briefly explain the function of {class_member}?",
+ f"Sure, according to the documentation, {class_member} is designed to:"
+ f" '{class_summary}'.",
+ ),
+ (
+ f"I'm curious about {class_member}, what's its purpose?",
+ f"Well, as per the documentation, {class_member} aims to: '{class_summary}'.",
+ ),
+ (
+ f"Could you give me a quick rundown on what {class_member} does?",
+ f"Absolutely, the documentation states that the role of {class_member} is to:"
+ f" '{class_summary}'.",
+ ),
+ (
+ f"What's the role of {class_member} in a nutshell?",
+ f"The documentation indicates that the purpose of {class_member} is to:"
+ f" '{class_summary}'.",
+ ),
+ (
+ f"Can you summarise the function of {class_member}?",
+ f"Of course, the documentation outlines that {class_member} is intended to:"
+ f" '{class_summary}'.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"{class_member} documents its purpose as follows: '{class_summary}'."
+ )
+ class_member_tuning_pairs.extend(class_summary_pairs)
+
+ if not (class_notes := member_type_details.class_notes):
+ class_notes_pairs = [
+ (
+ f"Mention any specific details for {class_member} to be aware of.",
+ f"Docstring of {class_member} does not note on specific details.",
+ ),
+ (
+ f"What are the specific details to be aware of for {class_member}?",
+ f"There are no specific details noted in the docstring of {class_member}.",
+ ),
+ (
+ f"Could you tell me any specifics for {class_member} that I should be aware of?",
+ f"The docstring of {class_member} doesn't highlight any details.",
+ ),
+ (
+ f"Are there any specific details for {class_member} that I need to know?",
+ f"No specific details are mentioned in the docstring of {class_member}.",
+ ),
+ (
+ f"I need to know the specific details for {class_member}. Can you provide them?",
+ f"Unfortunately, the docstring of {class_member} does not contain any details.",
+ ),
+ (
+ f"Can you specify any details for {class_member} that I should be aware of?",
+ f"The docstring of {class_member} does not specify any details to be aware of.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"Docstring of {class_member} has contains no specific implementation details."
+ )
+ class_member_tuning_pairs.extend(class_notes_pairs)
+ else:
+ class_notes_pairs = [
+ (
+ f"Mention any specific details for {class_member} to be aware of.",
+ f"The {class_member} docstring highlights the following: '{class_notes}'.",
+ ),
+ (
+ f"What are specifics that I should be aware of before using {class_member}?",
+ f"The details you should know to use {class_member} are highlighted in docstring:"
+ f" '{class_notes}'.",
+ ),
+ (
+ f"Could you specify the details for {class_member} to take note of?",
+ f"Sure, the docstring for {class_member} specifies the following details:"
+ f" '{class_notes}'.",
+ ),
+ (
+ f"Can you list the details for {class_member} to keep in mind?",
+ f"Certainly, the docstring for {class_member} lists the following details:"
+ f" '{class_notes}'.",
+ ),
+ (
+ f"What should users of {class_member} be mindful of?",
+ f"The docstring for {class_member} mentions the following points to be mindful of:"
+ f" '{class_notes}'.",
+ ),
+ (
+ f"What details does the user of {class_member} need to know?",
+ f"User of {class_member} needs to know the following details: '{class_notes}'.",
+ ),
+ ]
+ class_member_retrieval_chunks.append(
+ f"In docstring, {class_member} specifies the following: '{class_notes}'."
+ )
+ class_member_tuning_pairs.extend(class_notes_pairs)
+
+ class_member_dataset = Dataset(
+ retrieval_chunks=class_member_retrieval_chunks[:2], tuning_pairs=class_member_tuning_pairs
+ )
+
+ return class_member_dataset, class_member_retrieval_chunks
+
+
+@pydantic.validate_call(validate_return=True)
+def generate_function_member_dataset( # noqa: C901, PLR0912, PLR0915
+ function_member: str, function_docstring: str, member_type_details: FunctionDetails
+) -> tuple[Dataset, list[str]]:
+ function_member_retrieval_chunks: list[str] = [
+ f"{function_member} is a Python function.",
+ f"{function_member} has following docstring: {function_docstring}.",
+ ]
+ function_member_tuning_pairs: list[tuple[str, str]] = []
+
+ if not (function_parameters := member_type_details.function_parameters):
+ function_parameters_pairs = [
+ (
+ f"List various parameters of {function_member}.",
+ f"{function_member} does not take any parameters.",
+ ),
+ (
+ f"What are the parameters of {function_member}?",
+ f"{function_member} has no parameters.",
+ ),
+ (
+ f"Could you tell me the parameters that {function_member} takes?",
+ f"{function_member} doesn't require any parameters.",
+ ),
+ (
+ f"I need to know the parameters for {function_member}.",
+ f"There are no parameters for {function_member}.",
+ ),
+ (
+ f"Can you list the parameters for {function_member}?",
+ f"Actually, {function_member} doesn't have any parameters.",
+ ),
+ (
+ f"Please provide the parameters of {function_member}.",
+ f"Sorry, but {function_member} does not have any parameters.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(f"{function_member} takes no parameters.")
+ function_member_tuning_pairs.extend(function_parameters_pairs)
+ else:
+ function_parameter_names = enumerate_array_elements(
+ function_parameters, attribute="parameter_details"
+ )
+ function_parameters_pairs = [
+ (
+ f"List various parameters of {function_member}.",
+ f"Different parameters of {function_member} are as follows:"
+ f" {function_parameter_names}.",
+ ),
+ (
+ f"What are the different parameters of {function_member}?",
+ f"{function_member} has the following parameters: {function_parameter_names}.",
+ ),
+ (
+ f"Could you tell me the parameters of {function_member}?",
+ f"Sure, the parameters of {function_member} are: {function_parameter_names}.",
+ ),
+ (
+ f"I need to know the parameters of {function_member}.",
+ f"The parameters of {function_member} are: {function_parameter_names}.",
+ ),
+ (
+ f"Can you list the parameters for {function_member}?",
+ f"Yes, the parameters for {function_member} are: {function_parameter_names}.",
+ ),
+ (
+ f"Please provide the parameters of {function_member}.",
+ f"Parameters of {function_member} are as follows: {function_parameter_names}.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{function_member} takes the following parameters: {function_parameter_names}"
+ )
+ function_member_tuning_pairs.extend(function_parameters_pairs)
+
+ for function_parameter in function_parameters:
+ parameter_name = function_parameter.parameter_name
+ parameter = f"'{parameter_name}' argument in {function_member}"
+
+ if (parameter_default := function_parameter.parameter_default) is inspect._empty:
+ function_parameter_defaults_pairs = [
+ (f"Default value of {parameter}?", f"{parameter} does not have a default value."),
+ (
+ f"What is the default value for {parameter}?",
+ f"The {parameter} does not come with a default value.",
+ ),
+ (
+ f"Could you tell me default value of {parameter}?",
+ f"Sure, the {parameter} does not possess a default value.",
+ ),
+ (
+ f"I'm curious about default value of {parameter}.",
+ f"In response to your curiosity, {parameter} is not assigned a default value.",
+ ),
+ (
+ f"I'd like to know the default value of {parameter}.",
+ f"To answer your query, {parameter} does not hold a default value.",
+ ),
+ (
+ f"Can you inform me about the default value of {parameter}?",
+ f"Certainly, {parameter} does not contain a default value.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(f"{parameter} has no default value.")
+ function_member_tuning_pairs.extend(function_parameter_defaults_pairs)
+ else:
+ function_parameter_defaults_pairs = [
+ (
+ f"Default value of {parameter}?",
+ f"{parameter} has default value of {parameter_default}.",
+ ),
+ (
+ f"What is the default value for {parameter}?",
+ f"The default value for {parameter} is {parameter_default}.",
+ ),
+ (
+ f"Could you tell me default value of {parameter}?",
+ f"Sure, the default value of {parameter} is {parameter_default}.",
+ ),
+ (
+ f"I would like to know the default value of {parameter}.",
+ f"The {parameter} has a default value of {parameter_default}.",
+ ),
+ (
+ f"Can you inform me about the default value of {parameter}?",
+ f"Of course, the {parameter} defaults to {parameter_default}.",
+ ),
+ (
+ f"I'm interested in default value of {parameter}.",
+ f"The default value of the {parameter} is {parameter_default}.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{parameter} has the default value of {parameter_default}."
+ )
+ function_member_tuning_pairs.extend(function_parameter_defaults_pairs)
+
+ if (parameter_annotation := function_parameter.parameter_annotation) is inspect._empty:
+ function_parameter_types_pairs = [
+ (
+ f"What is type annotation of {parameter}?",
+ f"{parameter} does not have a type annotation.",
+ ),
+ (
+ f"Can you tell me type annotation of {parameter}?",
+ f"The {parameter} does not have a type annotation.",
+ ),
+ (
+ f"I'm curious about the type annotation of {parameter}."
+ " Can you provide some information?",
+ f"Sure, the {parameter} does not have a type annotation.",
+ ),
+ (
+ f"Do you have any information on the type annotation of {parameter}?",
+ f"Yes, the {parameter} does not have a type annotation.",
+ ),
+ (
+ f"Could you inform me about the type annotation of {parameter}?",
+ f"Certainly, {parameter} does not have a type annotation.",
+ ),
+ (
+ f"I'd like to know the type annotation of {parameter}.",
+ f"The {parameter} you're asking about does not have a type annotation.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"Unfortunately, type hint for {parameter} is missing."
+ )
+ function_member_tuning_pairs.extend(function_parameter_types_pairs)
+ else:
+ function_parameter_types_pairs = [
+ (
+ f"What is type annotation of {parameter}?",
+ f"Type annotation of {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"Can you tell me type annotation of {parameter}?",
+ f"Sure, the type annotation of {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"I'm curious about the type annotation of {parameter}. What is it?",
+ f"The type annotation of {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"Do you know type annotation of {parameter}?",
+ f"Yes, the type annotation of {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"Could you inform me about the type annotation of {parameter}?",
+ f"Of course, the type annotation of {parameter} is '{parameter_annotation}'.",
+ ),
+ (
+ f"What's the type annotation for {parameter}?",
+ f"The type annotation for {parameter} is '{parameter_annotation}'.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{parameter} has '{parameter_annotation}' as type annotation."
+ )
+ function_member_tuning_pairs.extend(function_parameter_types_pairs)
+
+ if not (parameter_summary := function_parameter.parameter_summary):
+ function_parameter_summary_pairs = [
+ (
+ f"What is {parameter} for?",
+ f"Docstring of {function_member} lacks a description for '{parameter_name}'.",
+ ),
+ (
+ f"Can you explain the purpose of {parameter}?",
+ f"The docstring of {function_member} doesn't provide a description.",
+ ),
+ (
+ f"I'm not sure what {parameter} does. Can you help?",
+ f"Unfortunately, the docstring of {function_member} doesn't include"
+ " a description.",
+ ),
+ (
+ f"Could you clarify the role of {parameter}?",
+ f"The description is missing in the docstring of {function_member}.",
+ ),
+ (
+ f"I'm confused about the {parameter}. What does it do?",
+ f"The docstring of {function_member} doesn't contain a description.",
+ ),
+ (
+ f"What does {parameter} do?",
+ f"There's no description in the docstring of {function_member}.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{parameter} is not documented in the docstring."
+ )
+ function_member_tuning_pairs.extend(function_parameter_summary_pairs)
+ else:
+ function_parameter_summary_pairs = [
+ (
+ f"What is {parameter} for?",
+ f"Based on {function_member} docstring, its role is '{parameter_summary}'.",
+ ),
+ (
+ f"Can you explain the role of {parameter}?",
+ f"Sure, according to the docstring of {function_member},"
+ f" '{parameter_name}' is used for '{parameter_summary}'.",
+ ),
+ (
+ f"I'm curious about the {parameter}. What does it do?",
+ f"Well, if you look at the docstring of {function_member}, you'll see that"
+ f" '{parameter_name}' is responsible for '{parameter_summary}'.",
+ ),
+ (
+ f"Could you tell me the purpose of {parameter}?",
+ f"Of course, the docstring of {function_member} indicates that"
+ f" '{parameter_name}' serves the purpose of '{parameter_summary}'.",
+ ),
+ (
+ f"What's the function of {parameter}?",
+ f"As per the docstring of {function_member}, '{parameter_name}' functions as:"
+ f" '{parameter_summary}'.",
+ ),
+ (
+ f"I'd like to know what '{parameter_name}' does in {function_member}.",
+ f"Sure thing, the docstring of {function_member} states that"
+ f" '{parameter_name}' does '{parameter_summary}'.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"In the docstring, {parameter} is described as '{parameter_summary}'."
+ )
+ function_member_tuning_pairs.extend(function_parameter_summary_pairs)
+
+ if (
+ returns_annotation := member_type_details.function_returns.returns_annotation
+ ) is inspect._empty:
+ function_return_type_pairs = [
+ (
+ f"What is the return type annotation of {function_member}?",
+ f"{function_member} lacks a return type annotation. It may still return though.",
+ ),
+ (
+ f"Can you tell me the return type annotation of {function_member}?",
+ f"The function {function_member} does not have a return type annotation."
+ " However, it may still return.",
+ ),
+ (
+ f"I'm curious about return type annotation of {function_member}. What is it?",
+ f"Well, {function_member} doesn't have a return type annotation."
+ " But, it could still return.",
+ ),
+ (
+ f"Do you know the return type annotation of {function_member}?",
+ f"Actually, {function_member} doesn't come with a return type annotation."
+ " It's possible that it still returns though.",
+ ),
+ (
+ f"Could you inform me about the return type annotation of {function_member}?",
+ f"Sure, {function_member} is missing a return type annotation."
+ " It might still return though.",
+ ),
+ (
+ f"What's the return type annotation for {function_member}?",
+ f"It appears that {function_member} is without a return type annotation."
+ " It may still have a return.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{function_member} has no return annotation, but its return can still be non-null."
+ )
+ function_member_tuning_pairs.extend(function_return_type_pairs)
+ else:
+ function_return_type_pairs = [
+ (
+ f"What is the return type annotation of {function_member}?",
+ f"Return type annotation for {function_member} is '{returns_annotation}'.",
+ ),
+ (
+ f"Can you tell me the return type annotation of {function_member}?",
+ f"Sure, return type annotation for {function_member} is '{returns_annotation}'.",
+ ),
+ (
+ f"I need to know the return type annotation of {function_member}.",
+ f"The return type annotation for {function_member} is '{returns_annotation}'.",
+ ),
+ (
+ f"Do you know the return type annotation of {function_member}?",
+ f"Yes, return type annotation for {function_member} is '{returns_annotation}'.",
+ ),
+ (
+ f"Could you inform me about the return type annotation of {function_member}?",
+ f"Of course, the return type for {function_member} is '{returns_annotation}'.",
+ ),
+ (
+ f"I'm curious about the return type annotation of {function_member}.",
+ f"The return type annotation for {function_member} is '{returns_annotation}'.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"Return of {function_member} is annotated as '{returns_annotation}'."
+ )
+ function_member_tuning_pairs.extend(function_return_type_pairs)
+
+ if not (returns_summary := member_type_details.function_returns.returns_summary):
+ function_return_summary_pairs = [
+ (
+ f"What does {function_member} return?",
+ f"Docstring of {function_member} does not describe its return.",
+ ),
+ (
+ f"Can you tell me what {function_member} returns?",
+ f"Docstring of {function_member} doesn't provide information about its return.",
+ ),
+ (
+ f"Do you know the return of {function_member}?",
+ f"Unfortunately, docstring of {function_member} doesn't specify what it returns.",
+ ),
+ (
+ f"I'm curious about what {function_member} returns. Can you help?",
+ f"I'm sorry, but the docstring of {function_member} doesn't clarify its return.",
+ ),
+ (
+ f"What's the return of {function_member}?",
+ f"The return of {function_member} is not described in its docstring.",
+ ),
+ (
+ f"Could you inform me about the return of {function_member}?",
+ f"Regrettably, the docstring of {function_member} doesn't detail its return.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(f"{function_member} does not document its return.")
+ function_member_tuning_pairs.extend(function_return_summary_pairs)
+ else:
+ function_return_summary_pairs = [
+ (
+ f"What does {function_member} return?",
+ f"Based on {function_member} docstring, the return contains: '{returns_summary}'.",
+ ),
+ (
+ f"Can you tell me what {function_member} returns?",
+ f"Sure, as per docstring of {function_member}, it returns: '{returns_summary}'.",
+ ),
+ (
+ f"I'm curious about what {function_member} returns. Can you help?",
+ f"Absolutely! The docstring of {function_member} indicates that it returns:"
+ f" '{returns_summary}'.",
+ ),
+ (
+ f"Do you know what {function_member} returns?",
+ f"Yes, the docstring of {function_member} states that it returns:"
+ f" '{returns_summary}'.",
+ ),
+ (
+ f"I'd like to know what {function_member} returns.",
+ f"Of course, the docstring of {function_member} reveals that its return contains:"
+ f" '{returns_summary}'.",
+ ),
+ (
+ f"Could you inform me about the return of {function_member}?",
+ f"Certainly, the docstring of {function_member} specifies that it returns:"
+ f" '{returns_summary}'.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"Based on docstring, return of {function_member} is as follows: '{returns_summary}'."
+ )
+ function_member_tuning_pairs.extend(function_return_summary_pairs)
+
+ if not (function_summary := member_type_details.function_summary):
+ function_summary_pairs = [
+ (
+ f"Summarise role of {function_member} in short.",
+ f"{function_member} docstring lacks a summary of its objective.",
+ ),
+ (
+ f"Can you briefly explain the role of {function_member}?",
+ f"The docstring of {function_member} doesn't provide its purpose.",
+ ),
+ (
+ f"What is the purpose of {function_member} as per its docstring?",
+ f"The docstring of {function_member} doesn't clearly state its purpose.",
+ ),
+ (
+ f"Could you provide a summary of objective of {function_member}?",
+ f"The objective of {function_member} is not summarised in its docstring.",
+ ),
+ (
+ f"What does {function_member} do according to its docstring?",
+ f"According to its docstring, role of {function_member} is not summarised.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(f"Documentation for {function_member} is missing.")
+ function_member_tuning_pairs.extend(function_summary_pairs)
+ else:
+ function_summary_pairs = [
+ (
+ f"Summarise role of {function_member} in short.",
+ f"Based on docstring, objective of {function_member} is to: '{function_summary}'.",
+ ),
+ (
+ f"Can you briefly explain the role of {function_member}?",
+ f"Sure, according to the docstring, the purpose of {function_member} is:"
+ f" '{function_summary}'.",
+ ),
+ (
+ f"What does {function_member} do, in a nutshell?",
+ f"In a nutshell, {function_member} is designed to: '{function_summary}'.",
+ ),
+ (
+ f"Could you provide a short summary of role of {function_member}?",
+ f"Certainly, from docstring, {function_member} aims to: '{function_summary}'.",
+ ),
+ (
+ f"I need a brief explanation of what {function_member} does.",
+ f"Of course, {function_member} is intended to: '{function_summary}'.",
+ ),
+ (
+ f"In brief, what is the role of {function_member}?",
+ f"Briefly, the role of {function_member} is to: '{function_summary}',"
+ " according to the docstring.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{function_member} documents itself as follows: '{function_summary}'."
+ )
+ function_member_tuning_pairs.extend(function_summary_pairs)
+
+ if not (function_raises := member_type_details.function_raises):
+ function_raise_types_pairs = [
+ (
+ f"Does {function_member} raise any specific exception?",
+ f"Docstring of {function_member} does not mention any specific exceptions.",
+ ),
+ (
+ f"Are there any specific exceptions that {function_member} raises?",
+ f"No specific exceptions are mentioned in the docstring of {function_member}.",
+ ),
+ (
+ f"Can you tell me if {function_member} raises any specific exceptions?",
+ f"According to docstring, {function_member} does not raise exceptions.",
+ ),
+ (
+ f"I want to know if {function_member} raises any specific exceptions."
+ " Can you confirm?",
+ f"I can confirm that docstring of {function_member} does not mention exceptions.",
+ ),
+ (
+ f"Could {function_member} possibly raise any specific exceptions?",
+ f"The docstring of {function_member} does not indicate that"
+ " it raises any specific exceptions.",
+ ),
+ (
+ f"Is it possible for {function_member} to raise any specific exceptions?",
+ f"The docstring of {function_member} does not suggest that"
+ " it raises any specific exceptions.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{function_member} does not document any specific exceptions in the docstring."
+ )
+ function_member_tuning_pairs.extend(function_raise_types_pairs)
+ else:
+ function_raise_types = enumerate_array_elements(
+ function_raises, attribute="raises_details"
+ )
+ function_raise_types_pairs = [
+ (
+ f"Does {function_member} raise any specific exception?",
+ f"Based on docstring of {function_member}, it can raise the following:"
+ f" {function_raise_types}.",
+ ),
+ (
+ f"Can you tell me if {function_member} raises any specific exceptions?",
+ f"Yes, according to docstring of {function_member}, it can raise these exceptions:"
+ f" {function_raise_types}.",
+ ),
+ (
+ f"What exceptions, if any, does {function_member} raise?",
+ f"{function_member} can raise these exceptions as per its docstring:"
+ f" {function_raise_types}.",
+ ),
+ (
+ f"I need to know if {function_member} throws any specific exceptions."
+ " Can you help?",
+ f"Sure, {function_member} can throw following exceptions according to docstring:"
+ f" {function_raise_types}.",
+ ),
+ (
+ f"Could you inform me about any specific exceptions that"
+ f" {function_member} might raise?",
+ f"Certainly, the docstring of {function_member} indicates that"
+ f" it can raise these exceptions: {function_raise_types}.",
+ ),
+ (
+ f"I'm curious about the exceptions that {function_member} might throw."
+ " Do you have any information?",
+ f"Yes, the docstring of {function_member} suggests that"
+ f" it can throw the following exceptions: {function_raise_types}.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"From docstring, {function_member} can raise the following: {function_raise_types}"
+ )
+ function_member_tuning_pairs.extend(function_raise_types_pairs)
+
+ if not (function_warns := member_type_details.function_warns):
+ function_warn_types_pairs = [
+ (
+ f"Does {function_member} throw any specific warnings?",
+ f"Docstring of {function_member} lacks any mention of specific warnings.",
+ ),
+ (
+ f"Are there any specific warnings that {function_member} throws?",
+ f"There are no specific warnings mentioned in docstring of {function_member}.",
+ ),
+ (
+ f"Can you tell me if {function_member} throws any specific warnings?",
+ f"According to the docstring of {function_member},"
+ " it doesn't throw any specific warnings.",
+ ),
+ (
+ f"I want to know if {function_member} throws any specific warnings."
+ " Can you help?",
+ f"Sure, I checked the docstring of {function_member} and"
+ " found no mention of specific warnings.",
+ ),
+ (
+ f"Could you check if {function_member} throws any specific warnings?",
+ f"I've checked the docstring of {function_member} and"
+ " it doesn't mention any specific warnings.",
+ ),
+ (
+ f"Is it possible that {function_member} throws any specific warnings?",
+ f"Based on the docstring of {function_member},"
+ " it doesn't seem to throw any specific warnings.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"Mention of any warnings is missing in docstring of {function_member}."
+ )
+ function_member_tuning_pairs.extend(function_warn_types_pairs)
+ else:
+ function_warn_types = enumerate_array_elements(function_warns, attribute="warns_details")
+ function_warn_types_pairs = [
+ (
+ f"Does {function_member} throw any specific warnings?",
+ f"Based on the docstring, {function_member} can throw the following warnings:"
+ f" {function_warn_types}.",
+ ),
+ (
+ f"Can you tell me if {function_member} throws any specific warnings?",
+ f"Yes, according to docstring, {function_member} may throw these warnings:"
+ f" {function_warn_types}.",
+ ),
+ (
+ f"I'm curious, does {function_member} generate any particular warnings?",
+ f"Indeed, docstring indicates that {function_member} can generate these warnings:"
+ f" {function_warn_types}.",
+ ),
+ (
+ f"What specific warnings, if any, does {function_member} throw?",
+ f"{function_member} throws the following warnings as per the docstring:"
+ f" {function_warn_types}.",
+ ),
+ (
+ f"Could {function_member} possibly throw any specific warnings?",
+ f"Yes, it could. Docstring of {function_member} mentions these specific warnings:"
+ f" {function_warn_types}.",
+ ),
+ (
+ f"Are there any specific warnings that {function_member} throws?",
+ f"Yes, there are. The docstring for {function_member} lists following warnings:"
+ f" {function_warn_types}.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{function_member} documents the following warnings: {function_warn_types}"
+ )
+ function_member_tuning_pairs.extend(function_warn_types_pairs)
+
+ if not (function_notes := member_type_details.function_notes):
+ function_notes_pairs = [
+ (
+ f"Is there any specific details for {function_member} to be aware of?",
+ f"Docstring of {function_member} lacks any notes on specific details.",
+ ),
+ (
+ f"Are there any particular details I should know about {function_member}?",
+ f"There are no specific details noted in the docstring of {function_member}.",
+ ),
+ (
+ f"What should I be aware of when using {function_member}?",
+ f"The docstring of {function_member} does not contain any details to be aware of.",
+ ),
+ (
+ f"Could you tell me if there are any specific details for {function_member}?",
+ f"No specific details are mentioned in the docstring of {function_member}.",
+ ),
+ (
+ f"I'm curious if there are any specific details about {function_member}?",
+ f"The docstring of {function_member} does not provide any specific details.",
+ ),
+ (
+ f"Do I need to be aware of any specific details for {function_member}?",
+ f"The docstring of {function_member} does not include any specific details.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{function_member} has no specific notes in the docstring."
+ )
+ function_member_tuning_pairs.extend(function_notes_pairs)
+ else:
+ function_notes_pairs = [
+ (
+ f"Is there any specific details for {function_member} to be aware of?",
+ f"Docstring of {function_member} highlights the following: '{function_notes}'.",
+ ),
+ (
+ f"What should I know about {function_member}?",
+ "You should be aware that docstring includes the following details:"
+ f" '{function_notes}'.",
+ ),
+ (
+ f"Could you provide some details about {function_member}?",
+ f"Sure, the docstring of {function_member} provides the following information:"
+ f" '{function_notes}'.",
+ ),
+ (
+ f"What are the important details of {function_member}?",
+ f"The important details of {function_member} are highlighted in its docstring:"
+ f" '{function_notes}'.",
+ ),
+ (
+ f"Can you tell me more about {function_member}?",
+ f"Of course, the docstring of {function_member} contains the following details:"
+ f" '{function_notes}'.",
+ ),
+ (
+ f"I need information about {function_member}.",
+ f"The docstring of {function_member} contains the following information:"
+ f" '{function_notes}'.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"Docstring for {function_member} has following notes: '{function_notes}'."
+ )
+ function_member_tuning_pairs.extend(function_notes_pairs)
+
+ if not (function_references := member_type_details.function_references):
+ function_references_pairs = [
+ (
+ f"Is there any reference for {function_member}?",
+ f"Documentation for {function_member} contains no references.",
+ ),
+ (
+ f"Can I find any references in the documentation for {function_member}?",
+ f"No, the documentation for {function_member} does not contain any references.",
+ ),
+ (
+ f"Does the documentation for {function_member} include any references?",
+ f"No, there are no references in the documentation for {function_member}.",
+ ),
+ (
+ f"Are there references available in the {function_member} documentation?",
+ f"No, the {function_member} documentation does not include any references.",
+ ),
+ (
+ f"I'm looking for references in {function_member} documentation. Are there any?",
+ f"Unfortunately, the documentation for {function_member} contains no references.",
+ ),
+ (
+ f"Could you tell me if there are any references for {function_member}?",
+ f"I'm sorry, but documentation for {function_member} lacks any references.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{function_member} documents no references in its docstring."
+ )
+ function_member_tuning_pairs.extend(function_references_pairs)
+ else:
+ function_references_pairs = [
+ (
+ f"Is there any reference for {function_member}?",
+ f"The docstring links the following: '{function_references}'.",
+ ),
+ (
+ f"Can you provide a reference for {function_member}?",
+ f"Sure, the docstring provides the following reference: '{function_references}'.",
+ ),
+ (
+ f"Where can I find a reference for {function_member}?",
+ f"You can find it in the docstring, which links to: '{function_references}'.",
+ ),
+ (
+ f"Could you point me to the reference for {function_member}?",
+ f"Of course, the docstring points to these reference: '{function_references}'.",
+ ),
+ (
+ f"I'm looking for a reference for {function_member}. Can you help?",
+ f"Absolutely, the docstring links to this reference: '{function_references}'.",
+ ),
+ (
+ f"What's the reference for {function_member}?",
+ f"The reference for that is in the docstring: '{function_references}'.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"{function_member} list the following references: {function_references}"
+ )
+ function_member_tuning_pairs.extend(function_references_pairs)
+
+ if not (function_examples := member_type_details.function_examples):
+ function_examples_pairs = [
+ (
+ f"Is there any example for {function_member}?",
+ f"Docstring for {function_member} lacks any examples.",
+ ),
+ (
+ f"Can I find an example for {function_member} in the docstring?",
+ f"Unfortunately, docstring for {function_member} does not contain any examples.",
+ ),
+ (
+ f"Does the docstring for {function_member} include any examples?",
+ f"No, the docstring for {function_member} does not include any examples.",
+ ),
+ (
+ f"I'm looking for an example of {function_member} in docstring, is there one?",
+ f"I'm sorry, but docstring for {function_member} does not provide any examples.",
+ ),
+ (
+ f"Are there any examples provided in the docstring for {function_member}?",
+ f"No examples are provided in the docstring for {function_member}.",
+ ),
+ (
+ f"Could you tell me if there's an example for {function_member} in docstring?",
+ f"I regret to inform you that {function_member} documents no examples.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"Documentation of {function_member} lacks any examples."
+ )
+ function_member_tuning_pairs.extend(function_examples_pairs)
+ else:
+ function_examples_pairs = [
+ (
+ f"Is there any example for {function_member}?",
+ f"Documentation of {function_member} contains these examples:"
+ f" '{function_examples}'.",
+ ),
+ (
+ f"Can you provide an example of {function_member}?",
+ f"Sure, you can find examples of {function_member} in its documentation:"
+ f" '{function_examples}'.",
+ ),
+ (
+ f"I'm looking for examples of {function_member}, can you help?",
+ f"Absolutely, examples for {function_member} are available in its documentation:"
+ f" '{function_examples}'.",
+ ),
+ (
+ f"Where can I find examples for {function_member}?",
+ f"You can find examples for {function_member} in its documentation:"
+ f" '{function_examples}'.",
+ ),
+ (
+ f"Could you show me some examples of {function_member}?",
+ f"Of course, the documentation of {function_member} includes these examples:"
+ f" '{function_examples}'.",
+ ),
+ (
+ f"I need examples for {function_member}, where can I find them?",
+ f"You can find examples for {function_member} in its documentation:"
+ f" '{function_examples}'.",
+ ),
+ ]
+ function_member_retrieval_chunks.append(
+ f"Docstring of {function_member} contains following examples: '{function_examples}'."
+ )
+ function_member_tuning_pairs.extend(function_examples_pairs)
+
+ function_member_dataset = Dataset(
+ retrieval_chunks=function_member_retrieval_chunks[:2],
+ tuning_pairs=function_member_tuning_pairs,
+ )
+
+ return function_member_dataset, function_member_retrieval_chunks
+
+
@pydantic.validate_call(validate_return=True)
def generate_member_dataset(member_details: MemberDetails) -> tuple[Dataset, ...]:
    """Build the dataset(s) describing one inspected module member.

    A generic dataset (parent module, qualified name, hierarchy, docstring
    and, when known, member type) is always produced.  When type specific
    details are present, the matching ``generate_*_member_dataset`` helper
    contributes a second dataset, and its retrieval chunks are folded into
    the generic one.

    :param member_details: inspected metadata of the member
    :return: one-element tuple for plain objects, two-element tuple otherwise
    :raises ValueError: if the member type is not enum, class or function
    """
    name = member_details.member_name
    qualified_name = member_details.member_qualified_name
    module_name = member_details.member_module
    subject = f"'{name}' object"

    chunks: list[str] = []
    pairs: list[tuple[str, str]] = []

    # parent module question/answer variations
    pairs += [
        (
            f"What is the parent module of {subject}?",
            f"'{module_name}' is the name of its parent module.",
        ),
        (
            f"Can you tell me the parent module of {subject}?",
            f"Sure, the parent module of {subject} is '{module_name}'.",
        ),
        (
            f"I'm trying to find the parent module of {subject}, can you help?",
            f"Of course, parent module of {subject} is '{module_name}'.",
        ),
        (
            f"Do you know the parent module of {subject}?",
            f"Yes, the parent module of {subject} is '{module_name}'.",
        ),
        (
            f"I need to know the parent module of {subject}, can you provide that?",
            f"Absolutely, parent module of {subject} is '{module_name}'.",
        ),
        (
            f"Could you inform me about the parent module of {subject}?",
            f"Certainly, '{module_name}' is parent module of {subject}.",
        ),
    ]
    chunks.append(f"{subject} is part of parent module {module_name}.")

    # fully qualified name question/answer variations
    pairs += [
        (
            f"What is the full name of {subject}?",
            f"'{qualified_name}' is its fully qualified name.",
        ),
        (
            f"Can you tell me the full name of the {subject}?",
            f"Sure, the fully qualified name of {subject} is '{qualified_name}'.",
        ),
        (
            f"I need to know the full name of {subject}. Can you help?",
            f"Of course, the full name of {subject} is '{qualified_name}'.",
        ),
        (
            f"What's the fully qualified name for the {subject}?",
            f"The fully qualified name for {subject} is '{qualified_name}'.",
        ),
        (
            f"Could you provide the full name of the {subject}?",
            f"Certainly, the full name of the {subject} is '{qualified_name}'.",
        ),
        (
            f"I'm looking for the full name of {subject}. What is it?",
            f"The full name of {subject} is '{qualified_name}'.",
        ),
    ]
    chunks.append(f"Full name of {subject} is '{qualified_name}'.")

    # hierarchy question/answer variations
    hierarchy = enumerate_array_elements(member_details.member_hierarchy)
    pairs += [
        (
            f"What is the hierarchy of {subject}?",
            f"The hierarchy of {subject} is as follows: {hierarchy}.",
        ),
        (
            f"Can you explain the hierarchy of the {subject}?",
            f"Sure, the hierarchy of the {subject} is: {hierarchy}.",
        ),
        (
            f"Could you tell me the hierarchy of {subject}?",
            f"Of course, the hierarchy of {subject} is: {hierarchy}.",
        ),
        (
            f"I would like to know the hierarchy of {subject}. Can you provide that?",
            f"Absolutely, the hierarchy of {subject} is: {hierarchy}.",
        ),
        (
            f"Please provide the hierarchy of {subject}.",
            f"The hierarchy of {subject} is: {hierarchy}.",
        ),
        (
            f"I'm interested in the hierarchy of {subject}. Could you share it?",
            f"Sure, the hierarchy of {subject} is: {hierarchy}.",
        ),
    ]
    chunks.append(f"Hierarchy of {subject} is as follows: {hierarchy}.")

    # docstring question/answer variations (different wording when missing)
    docstring = member_details.member_docstring

    if docstring:
        pairs += [
            (f"What does {subject} do?", f"Its documentation is as follows: '{docstring}'."),
            (
                f"Can you explain the function of the {subject}?",
                f"Sure, here is its documentation: '{docstring}'.",
            ),
            (
                f"I'm not sure what {subject} does. Can you clarify?",
                f"Of course, here's its documentation for clarification: '{docstring}'.",
            ),
            (
                f"Could you tell me about the {subject}?",
                f"Certainly, its documentation is: '{docstring}'.",
            ),
            (
                f"I need information on the {subject}.",
                f"Here's the documentation you need: '{docstring}'.",
            ),
            (
                f"What's the purpose of the {subject}?",
                f"The purpose is described in its documentation: '{docstring}'.",
            ),
        ]
        chunks.append(f"The following is the documentation of {subject}: '{docstring}'.")
    else:
        pairs += [
            (
                f"What is the documentation of {subject}?",
                f"{subject} does not have any documentation.",
            ),
            (
                f"Can you provide the documentation for the {subject}?",
                f"Sorry, the {subject} does not have any documentation.",
            ),
            (
                f"Is there any documentation available for the {subject}?",
                f"No, there is no documentation available for the {subject}.",
            ),
            (
                f"Could you show me the documentation of the {subject}?",
                f"Unfortunately, the {subject} does not have any documentation.",
            ),
            (
                f"I'm looking for the documentation of {subject}. Can you help?",
                f"I'm sorry, but the {subject} does not have any documentation.",
            ),
        ]
        chunks.append(f"Unfortunately, {subject} currently does not have any documentation.")

    type_details = member_details.member_type_details

    if type_details is None:
        # no richer details available: mark it as a plain object and stop here
        chunks.insert(0, f"'{name}' is a Python object.")

        return (Dataset(retrieval_chunks=chunks, tuning_pairs=pairs),)

    member_type = type_details.member_type

    # member type question/answer variations
    pairs += [
        (f"What is the type of {subject}?", f"{subject} is of '{member_type.value}' type."),
        (
            f"Can you tell me the type of the {subject}?",
            f"Sure, the {subject} is of '{member_type.value}' type.",
        ),
        (
            f"I would like to know the type of {subject}. Can you help?",
            f"Absolutely, the {subject} is of '{member_type.value}' type.",
        ),
        (
            f"Do you know the type of {subject}?",
            f"Yes, the {subject} is of '{member_type.value}' type.",
        ),
        (
            f"Could you inform me about the type of {subject}?",
            f"Of course, the {subject} is of '{member_type.value}' type.",
        ),
        (
            f"I'm curious about type of {subject}. Can you provide some information?",
            f"Certainly, the {subject} is of '{member_type.value}' type.",
        ),
    ]
    # keep the type chunk just before the docstring chunk, as downstream expects
    chunks.insert(-1, f"'{name}' is a Python {member_type.value}.")

    # dispatch to the matching type-specific generator
    builders = {
        MemberType.ENUM: (generate_enum_member_dataset, "enum"),
        MemberType.CLASS: (generate_class_member_dataset, "class"),
        MemberType.FUNCTION: (generate_function_member_dataset, "function"),
    }

    if member_type not in builders:
        LOGGER.critical(f"Received unsupported member_type={member_type!r}")

        raise ValueError("Unexpected member type: supports 'enum', 'class', 'function'")

    builder, label = builders[member_type]
    type_dataset, type_chunks = builder(f"'{name}' {label}", docstring, type_details)

    combined_dataset = Dataset(retrieval_chunks=chunks + type_chunks, tuning_pairs=pairs)

    return (combined_dataset, type_dataset)
+
+
# Explicit public API of this dataset generation module.
__all__ = [
    "enumerate_array_elements",
    "generate_class_member_dataset",
    "generate_enum_member_dataset",
    "generate_function_member_dataset",
    "generate_member_dataset",
    "generate_module_dataset",
    "generate_package_dataset",
]
diff --git a/src/generative_ai/dataset_generation/utils_generation.py b/src/generative_ai/dataset_generation/utils_generation.py
new file mode 100644
index 0000000..3f2d662
--- /dev/null
+++ b/src/generative_ai/dataset_generation/utils_generation.py
@@ -0,0 +1,211 @@
+import enum
+import functools
+import typing
+
+import pydantic
+
+
class Package(pydantic.BaseModel):
    """Metadata describing one Python package and its immediate children."""

    package_name: str  # bare package name (no dots)
    package_qualified_name: str  # fully qualified dotted name
    package_hierarchy: list[str]  # chain of enclosing packages — presumably root first; confirm against builder
    parent_package_name: str | None  # None when this is the top-level package
    children_sub_packages_names: list[str]  # names of direct sub-packages
    children_modules_names: list[str]  # names of direct modules
    package_summary: str | None = None  # short description, when available
    package_all_exports: list[str] | None = None  # the package's __all__, when defined
+
+
class ModuleMember(pydantic.BaseModel):
    """A named member found inside a module."""

    member_name: str  # attribute name within the module
    member_object: typing.Any  # the member itself — presumably the live imported object; verify against caller
+
+
class Module(pydantic.BaseModel):
    """Metadata describing one module together with its discovered members."""

    module_name: str  # bare module name (no dots)
    module_qualified_name: str  # fully qualified dotted name
    module_hierarchy: list[str]  # chain of enclosing packages/modules — presumably root first; confirm against builder
    package_name: str  # name of the package this module belongs to
    module_members: list[ModuleMember]  # members discovered inside the module
    module_summary: str | None = None  # short description, when available
    module_all_exports: list[str] | None = None  # the module's __all__, when defined
+
+
class MemberType(str, enum.Enum):
    """Categories of members with type-specific details (str-valued so it serializes as plain text)."""

    ENUM = "enum"
    CLASS = "class"
    FUNCTION = "function"
+
+
class EnumMember(pydantic.BaseModel):
    """One name/value pair of a documented enum."""

    enum_member_name: str  # the enum member's name
    enum_member_value: typing.Any  # the enum member's value (enum values may be of any type)

    @pydantic.computed_field
    @functools.cached_property
    def enum_member(self: "EnumMember") -> str:
        """Human-readable label combining name and value, computed once and cached."""
        return f"{self.enum_member_name} (corresponding to '{self.enum_member_value}')"
+
+
class EnumDetails(pydantic.BaseModel):
    """Type-specific details for an enum member (the ``member_type`` literal is the union discriminator)."""

    member_type: typing.Literal[MemberType.ENUM]
    enum_members: list[EnumMember]  # all name/value pairs of the enum
+
+
class Parameter(pydantic.BaseModel):
    """Description of one parameter extracted from a callable's signature."""

    parameter_name: str
    parameter_default: typing.Any  # default value, opaque here
    parameter_annotation: typing.Any  # type annotation, opaque here
    parameter_kind: str  # presumably the inspect.Parameter kind (e.g. positional-or-keyword); confirm against builder
    parameter_summary: str | None = None  # docstring description, when available

    @pydantic.computed_field
    @functools.cached_property
    def parameter_details(self: "Parameter") -> str:
        """Label used in generated text; NOTE(review): it renders ``parameter_kind`` after "of type", not the annotation — confirm intended."""
        return f"'{self.parameter_name}', of type '{self.parameter_kind}'"
+
+
class Method(pydantic.BaseModel):
    """Description of one method of a documented class."""

    method_name: str
    method_parameters: list[str]  # parameter names only
    method_summary: str | None = None  # docstring description, when available
+
+
class Attribute(pydantic.BaseModel):
    """Description of one attribute of a documented class (name only)."""

    attribute_name: str
+
+
class ClassDetails(pydantic.BaseModel):
    """Type-specific details for a class member (the ``member_type`` literal is the union discriminator)."""

    member_type: typing.Literal[MemberType.CLASS]
    class_parameters: list[Parameter]  # constructor parameters
    class_methods: list[Method]
    class_attributes: list[Attribute]
    class_summary: str | None = None  # docstring summary, when available
    class_notes: str | None = None  # docstring "Notes" section, when available
+
+
class Returns(pydantic.BaseModel):
    """Return-value documentation of a function."""

    returns_annotation: typing.Any  # return type annotation, opaque here
    returns_summary: str | None = None  # docstring description, when available
+
+
class Raises(pydantic.BaseModel):
    """One documented exception a function may raise."""

    raises_type: str | None = None  # exception type name, when documented
    raises_summary: str | None = None  # condition/description, when documented

    @pydantic.computed_field
    @functools.cached_property
    def raises_details(self: "Raises") -> str:
        """Label "'type' ('summary')" used in generated text; missing fields render literally as 'None'."""
        return f"'{self.raises_type}' ('{self.raises_summary}')"
+
+
class Warns(pydantic.BaseModel):
    """One documented warning a function may emit."""

    warns_type: str | None = None  # warning category name, when documented
    warns_summary: str | None = None  # condition/description, when documented

    @pydantic.computed_field
    @functools.cached_property
    def warns_details(self: "Warns") -> str:
        """Label "'type' ('summary')" used in generated text; missing fields render literally as 'None'."""
        return f"'{self.warns_type}' ('{self.warns_summary}')"
+
+
class FunctionDetails(pydantic.BaseModel):
    """Type-specific details for a function member (the ``member_type`` literal is the union discriminator)."""

    member_type: typing.Literal[MemberType.FUNCTION]
    function_parameters: list[Parameter]
    function_returns: Returns
    function_summary: str | None = None  # docstring summary, when available
    function_raises: list[Raises] | None = None  # documented exceptions, when any
    function_warns: list[Warns] | None = None  # documented warnings, when any
    function_notes: str | None = None  # docstring "Notes" section, when available
    function_references: str | None = None  # docstring "References" section, when available
    function_examples: str | None = None  # docstring "Examples" section, when available
+
+
class MemberDetails(pydantic.BaseModel):
    """Full description of one module member, optionally with type-specific details."""

    member_name: str  # bare member name
    member_qualified_name: str  # fully qualified dotted name
    member_hierarchy: list[str]  # chain of enclosing scopes — presumably root first; confirm against builder
    member_module: str  # qualified name of the parent module
    member_docstring: str  # raw docstring; empty string when absent
    # Discriminated union: pydantic selects the concrete model by the
    # ``member_type`` literal tag; None for members of unsupported types.
    member_type_details: EnumDetails | ClassDetails | FunctionDetails | None = pydantic.Field(
        default=None, discriminator="member_type"
    )
+
+
class Document(pydantic.BaseModel):
    """One question/answer record together with the retrieval context backing it."""

    context: str  # joined retrieval chunks relevant to the answer
    question: str
    answer: str

    @pydantic.computed_field
    @functools.cached_property
    def instruction_with_context(self: "Document") -> str:
        """Render the record as an ``[INST]``-tagged prompt including the context.

        The empty strings at both ends of the join yield a leading and a
        trailing space — presumably required by the fine-tuning prompt
        format; confirm against the tuning notebooks.
        """
        system_instruction = (
            "Below is a question that can be answered using the following context. "
            "Write an answer for the question appropriately without using any additional data."
        )

        return " ".join(
            [
                "",
                f"[INST] {system_instruction} [/INST]",
                f"[INST] Context: {self.context} [/INST]",
                f"[INST] Question: {self.question} [/INST]",
                f"[INST] Answer: {self.answer} [/INST]",
                "",
            ]
        )

    @pydantic.computed_field
    @functools.cached_property
    def instruction_without_context(self: "Document") -> str:
        """Render a context-free prompt; note the deliberate trailing space."""
        return f"[INST] {self.question} [/INST] {self.answer} "
+
+
class Dataset(pydantic.BaseModel):
    """Paired retrieval chunks and (question, answer) tuples for one documented object."""

    retrieval_chunks: list[str]  # standalone facts for the retrieval index
    tuning_pairs: list[tuple[str, str]]  # (question, answer) pairs for fine-tuning

    @pydantic.computed_field
    @functools.cached_property
    def tuning_documents(self: "Dataset") -> list[Document]:
        """Expand every tuning pair into a Document sharing the full joined context."""
        return [
            Document(context=" ".join(self.retrieval_chunks), question=question, answer=answer)
            for question, answer in self.tuning_pairs
        ]
+
+
class JSONDocument(pydantic.BaseModel):
    """Flat, JSON-serializable counterpart of Document with its computed fields materialized."""

    context: str
    question: str
    answer: str
    instruction_with_context: str
    instruction_without_context: str
+
+
class JSONDataset(pydantic.BaseModel):
    """JSON-serializable container pairing retrieval documents with tuning documents."""

    retrieval_documents: list[str]
    tuning_documents: list[JSONDocument]
+
+
# Explicit public API of this utilities module.
__all__ = [
    "Attribute",
    "ClassDetails",
    "Dataset",
    "Document",
    "EnumDetails",
    "EnumMember",
    "FunctionDetails",
    "JSONDataset",
    "JSONDocument",
    "MemberDetails",
    "MemberType",
    "Method",
    "Module",
    "ModuleMember",
    "Package",
    "Parameter",
    "Raises",
    "Returns",
    "Warns",
]
diff --git a/src/generative_ai/fine_tuning/step_1_tuning.ipynb b/src/generative_ai/fine_tuning/step_1_tuning.ipynb
new file mode 100644
index 0000000..3397a05
--- /dev/null
+++ b/src/generative_ai/fine_tuning/step_1_tuning.ipynb
@@ -0,0 +1,7589 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HHEHKsv3g2l8",
+ "outputId": "f09b6240-b150-4eca-af1f-4d0f858e81dd",
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118\n",
+ "Collecting accelerate (from -r requirements.txt (line 1))\n",
+ " Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m265.7/265.7 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting bitsandbytes (from -r requirements.txt (line 2))\n",
+ " Downloading bitsandbytes-0.41.3-py3-none-any.whl (92.6 MB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m92.6/92.6 MB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting datasets (from -r requirements.txt (line 3))\n",
+ " Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m52.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting peft (from -r requirements.txt (line 4))\n",
+ " Downloading peft-0.7.0-py3-none-any.whl (168 kB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m168.3/168.3 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 5)) (0.4.1)\n",
+ "Collecting torch (from -r requirements.txt (line 6))\n",
+ " Downloading https://download.pytorch.org/whl/cu118/torch-2.1.1%2Bcu118-cp310-cp310-linux_x86_64.whl (2325.9 MB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m2.3/2.3 GB\u001b[0m \u001b[31m539.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 7)) (4.35.2)\n",
+ "Collecting trl (from -r requirements.txt (line 8))\n",
+ " Downloading trl-0.7.4-py3-none-any.whl (133 kB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m133.9/133.9 kB\u001b[0m \u001b[31m20.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate->-r requirements.txt (line 1)) (1.23.5)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate->-r requirements.txt (line 1)) (23.2)\n",
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate->-r requirements.txt (line 1)) (5.9.5)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate->-r requirements.txt (line 1)) (6.0.1)\n",
+ "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate->-r requirements.txt (line 1)) (0.19.4)\n",
+ "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (9.0.0)\n",
+ "Collecting pyarrow-hotfix (from datasets->-r requirements.txt (line 3))\n",
+ " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
+ "Collecting dill<0.3.8,>=0.3.0 (from datasets->-r requirements.txt (line 3))\n",
+ " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (1.5.3)\n",
+ "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (2.31.0)\n",
+ "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (4.66.1)\n",
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (3.4.1)\n",
+ "Collecting multiprocess (from datasets->-r requirements.txt (line 3))\n",
+ " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m20.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (2023.6.0)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (3.9.1)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->-r requirements.txt (line 6)) (3.13.1)\n",
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch->-r requirements.txt (line 6)) (4.5.0)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->-r requirements.txt (line 6)) (1.12)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->-r requirements.txt (line 6)) (3.2.1)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->-r requirements.txt (line 6)) (3.1.2)\n",
+ "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch->-r requirements.txt (line 6)) (2.1.0)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->-r requirements.txt (line 7)) (2023.6.3)\n",
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers->-r requirements.txt (line 7)) (0.15.0)\n",
+ "Collecting tyro>=0.5.11 (from trl->-r requirements.txt (line 8))\n",
+ " Downloading tyro-0.6.0-py3-none-any.whl (100 kB)\n",
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m100.9/100.9 kB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (23.1.0)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (6.0.4)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (1.9.3)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (1.4.0)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (1.3.1)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (4.0.3)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->-r requirements.txt (line 3)) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->-r requirements.txt (line 3)) (3.6)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->-r requirements.txt (line 3)) (2.0.7)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->-r requirements.txt (line 3)) (2023.11.17)\n",
+ "Collecting docstring-parser>=0.14.1 (from tyro>=0.5.11->trl->-r requirements.txt (line 8))\n",
+ " Downloading docstring_parser-0.15-py3-none-any.whl (36 kB)\n",
+ "Requirement already satisfied: rich>=11.1.0 in /usr/local/lib/python3.10/dist-packages (from tyro>=0.5.11->trl->-r requirements.txt (line 8)) (13.7.0)\n",
+ "Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl->-r requirements.txt (line 8))\n",
+ " Downloading shtab-1.6.5-py3-none-any.whl (13 kB)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->-r requirements.txt (line 6)) (2.1.3)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->-r requirements.txt (line 3)) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->-r requirements.txt (line 3)) (2023.3.post1)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->-r requirements.txt (line 6)) (1.3.0)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets->-r requirements.txt (line 3)) (1.16.0)\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=11.1.0->tyro>=0.5.11->trl->-r requirements.txt (line 8)) (3.0.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=11.1.0->tyro>=0.5.11->trl->-r requirements.txt (line 8)) (2.16.1)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=11.1.0->tyro>=0.5.11->trl->-r requirements.txt (line 8)) (0.1.2)\n",
+ "Installing collected packages: bitsandbytes, shtab, pyarrow-hotfix, docstring-parser, dill, torch, multiprocess, tyro, accelerate, datasets, trl, peft\n",
+ " Attempting uninstall: torch\n",
+ " Found existing installation: torch 2.1.0+cu118\n",
+ " Uninstalling torch-2.1.0+cu118:\n",
+ " Successfully uninstalled torch-2.1.0+cu118\n",
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "torchaudio 2.1.0+cu118 requires torch==2.1.0, but you have torch 2.1.1+cu118 which is incompatible.\n",
+ "torchdata 0.7.0 requires torch==2.1.0, but you have torch 2.1.1+cu118 which is incompatible.\n",
+ "torchtext 0.16.0 requires torch==2.1.0, but you have torch 2.1.1+cu118 which is incompatible.\n",
+ "torchvision 0.16.0+cu118 requires torch==2.1.0, but you have torch 2.1.1+cu118 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[0mSuccessfully installed accelerate-0.25.0 bitsandbytes-0.41.3 datasets-2.15.0 dill-0.3.7 docstring-parser-0.15 multiprocess-0.70.15 peft-0.7.0 pyarrow-hotfix-0.6 shtab-1.6.5 torch-2.1.1+cu118 trl-0.7.4 tyro-0.6.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python3 -m pip install \\\n",
+ " --upgrade \\\n",
+ " --requirement requirements.txt \\\n",
+ " --constraint constraints.txt \\\n",
+ " --extra-index-url https://download.pytorch.org/whl/cu118"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "_x4RfffVg8Nw",
+ "outputId": "b0022811-1c7d-413c-9eb0-e6830f672c51",
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/trl/trainer/ppo_config.py:141: UserWarning: The `optimize_cuda_cache` arguement will be deprecated soon, please use `optimize_device_cache` instead.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "import gc\n",
+ "import json\n",
+ "import pathlib\n",
+ "import shutil\n",
+ "\n",
+ "import datasets\n",
+ "import peft\n",
+ "import torch\n",
+ "import transformers\n",
+ "import trl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "rNebA6JYg8Lo",
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "raw_dataset_path = pathlib.Path(\"json_documents.json\")\n",
+ "base_model_identifier = \"HuggingFaceH4/zephyr-7b-beta\"\n",
+ "tuning_checkpoint_directory = pathlib.Path(\"zephyr_tuning_checkpoints_directory\")\n",
+ "tuned_adapter_directory = pathlib.Path(\"tuned_zephyr_adapter_directory\")\n",
+ "tuned_adapter_archive = \"tuned_zephyr_adaptr_archive\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "T2ksh4jcg8Jj",
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "with raw_dataset_path.open(encoding=\"utf-8\") as file_object:\n",
+ " raw_dataset = json.load(file_object)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "CMkyFv9og8HX",
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "hugging_face_dataset = datasets.Dataset.from_list(raw_dataset[\"tuning_documents\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "b3pjFXj6g8Fb",
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "quantisation_configuration = transformers.BitsAndBytesConfig(\n",
+ " load_in_4bit=True,\n",
+ " bnb_4bit_compute_dtype=\"float16\",\n",
+ " bnb_4bit_quant_type=\"nf4\",\n",
+ " bnb_4bit_use_double_quant=False,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 433,
+ "referenced_widgets": [
+ "07df370aa2c243e285d2fc79a85c7fde",
+ "73699ca649c64464b238ff37d07915de",
+ "6dae7824e19042cdb52f9fff4655b2f4",
+ "9f44f076d9814ec0b5d00462f68db4b1",
+ "176b326bab79416296b128b64a74dffa",
+ "b5d6ffe8087e4241bfd9ce456dd7dc98",
+ "80f588df8be74093aa2220e2ec56b8ac",
+ "ac5b0d2c6b3e4f01971c0d88fda1ea00",
+ "6f382477db344113874777c653befe95",
+ "1c9d1a1a20e7428e8132cc0e12e1d503",
+ "c8d699e0eb154de88591c21570e1ab2c",
+ "9420b28ead574ec0818ad38d673a4a22",
+ "181e18b7222a4eaa901585883c60d29c",
+ "cb2bd1a572fc4bfb8b1836ae91e316c6",
+ "fb841e8cd6cf42658bd2a4278ff5fd77",
+ "07bedd16874a4f5687394c4f936261e2",
+ "2951f44f15c24bfdba4ef2a85dc778f9",
+ "b787e05c120947ce98f98641b8f6c45c",
+ "4145a60168564eaeb658809ad6b7aa4e",
+ "389adb8bfbbd40a4977ba41b02fc1739",
+ "ee78a285765d4530b208759a100ab35b",
+ "cff8de1e75e5447e908fc49aac16f017",
+ "0ab5311fbafb41fbb1a734b95765e00a",
+ "456d154a13d349caaacbca90447cc4a8",
+ "2c3e05971c9b4534bf597735e3b3355e",
+ "cde82cb73bc84263a4b9b786f5f777f6",
+ "66e8bd5cec7d45bc853eb4a3ae281de4",
+ "919ff6a0fe2c475eab19f87cc0970cfc",
+ "03749a64798b469bb0adab1cb618db67",
+ "4a2a2c053de048129bbdb20bb2977b9b",
+ "3bbcaf28992f4117a7438a6e2ed66e8d",
+ "a43fc313187640308b924d4248b45a38",
+ "f196b24df94440a0974fe2545ec5d404",
+ "a6e30cba8ec74f4a90c08a42477be348",
+ "823ff0863e6f4f51bc4e375111503d0f",
+ "25e20296adf540198ec4771b1152bb8d",
+ "c33b4b12128d4cddbf36301090e7ee1c",
+ "5c7b7311020a49c48e4d50972958c15b",
+ "646981baa186423394f94417ffb548c4",
+ "e3f627d0f035427abe93035ca0e067db",
+ "209cc282d5d74e5493cb4c0e59324759",
+ "d11e8b4a270b44f9bae001984b5adad7",
+ "67d743a7bc7c4b64ad6d3e42485d195a",
+ "a025d05c8114473a8f0ee38ec2d880d9",
+ "3917a2598c5c42bd9681a005338f34d7",
+ "127d9fab6a6943febce137004b078ca3",
+ "e828c38645c941edaa4f1538f16f8ad6",
+ "c028548c75fb4b24b8222a92e521d551",
+ "7c0d798503ed414a99888deef2bae663",
+ "f5655604a6124cb8affe4e578ef3a9d0",
+ "0fe4ad29e1ec407abda1439318fd1e49",
+ "2c6c89f1df7e43c088d4da49d54c3492",
+ "d347864fa320455f9d65183b900b1ebe",
+ "4b1c76e2b72c4d2c9fd6fd86ac73cbf0",
+ "2b6aa6918b1c46108dfae2306128d394",
+ "48b72b62f2284c3d81a76a223ad00b1a",
+ "f172e4ca01eb4214a5a86afaa3f630d0",
+ "b06327d175d44b6bab631b8f88619d49",
+ "68b54843defa4fe8b633449749b6cdde",
+ "ace42011a97546a7bfd44633636db3fa",
+ "74ff0965efee4d07bd7d8354820bf3e0",
+ "4a4d7d40743c4dc1ab34c6164348e6c5",
+ "d9496c940e9d482ba3c0381901d39040",
+ "800152f817dc4fb6ba532af84822af52",
+ "a62bebfe702d4b4cb2c036620672c5cc",
+ "4f75f65c676b4e218f85cad935752aba",
+ "fd30b1b0665e4029a67b79eff69bddad",
+ "1d74948585a949be9e43f4eab670a5e3",
+ "1349cc1f194c4059a01d766e92e1f496",
+ "a04ff0580a844e218a1c324f958007f0",
+ "c50e954dcae84a5aae06a69701808f8e",
+ "61722cf185fc4d2b8df5f3e721ec4e96",
+ "2068f3ca95e141d08f6ccf80344d0eb7",
+ "dc501cc569f24be4909d460c2982f601",
+ "07e2b43d832f41f8bd2f31b2c7ab1db9",
+ "e0e653ebaea14dbdb058ca323db1bc48",
+ "2463289661c8473bb279d40a067fb1d3",
+ "e52212e786544c91b390c358c8c0b1a9",
+ "1017bddf6aa2491a934f843762e4d769",
+ "ae8a13b81b9542828834f6f91dc7881b",
+ "b083574c95e247b597381725b4500d5c",
+ "d7fd1545ebf24ea7be7c4d3580468ca1",
+ "23af875260eb4f6e9cfa58924f9e0f84",
+ "070196bfff7a40e1955542b07e928e61",
+ "5d244b03d8fe423ebc86055fa075de54",
+ "d119089cb25c47ad8f2819a28bbb8bca",
+ "03418c74c9b343a7a15df8e6a5c06037",
+ "af54328406854e4e891dfd04beef9806",
+ "c644623691084e0b881144652c0f31ec",
+ "44672c499e0d48d6a4ce0e088770e625",
+ "ec885745589f486891fe25deeb8c9312",
+ "46416efdf6c54664af4247147a1d4eb6",
+ "499cff098f5744baa8fe1250a94854b0",
+ "c4be9b1eb3a74fa29db1c1ddaa3d6b93",
+ "c38cc65dd83146d1aec861c4404225e2",
+ "e4f1915bbe5a493aadd4b20a2eeccb65",
+ "391bf170d0864495b9b52040d83a0c8d",
+ "df58cafb63c842b193687188c752e92b",
+ "cb289455571640b0b48c7759427de84d",
+ "6ae56fbadf4d488c97bc8ce0b1439786",
+ "e09f5d2f37fe45a4bde7a745b36b3c9c",
+ "4cc42006fef547c58655e09b6b2ce478",
+ "ddf873b21b1b41d9978d7b6b050e65d0",
+ "6760e13ff3c94ef89f2cff167ca82e7b",
+ "c539bffd517e477aa28fbfed3fdf0eaa",
+ "108ee1f4f0ab4d948ef7296e5dd25c7c",
+ "aa2ce018e85648cb98aa512b8f783719",
+ "35c52a25cb074d3eadc25ae08556bb5d",
+ "4b0ac607c0284a898a7dd672ba377c29",
+ "15bede68f8644745a75a7ad4ef31ea93",
+ "1250961292b44114bd23911d87f65747",
+ "4ab90f7385d4424c8f82e4d264845251",
+ "a287d589b85c49eb9f4fb6711a919d5e",
+ "d812f7765f2a4f79ade8b695b622364a",
+ "fb465bc9def94e00aedc6c2df69fe30b",
+ "aed95fb1f3984cf897e5b303e909ecbe",
+ "4915287dbe5b4a4a856c65dbf51c1229",
+ "9804c0ca28bd4617a5ef9494794d2520",
+ "9346ec32d8e0421f9120c4368553f570",
+ "63cac8d6dc7f4715b46ae5ba74c2e397",
+ "a6831777a9a341d7ad73799cfb53d785",
+ "a0051fa0a8584767bc36b0afd13b4386",
+ "0482f2abec544cecb4e93071ac70c802",
+ "ee4492e67b994a4db203c8be36118bc1",
+ "7d441115bec446ae9adb907d21ef75f4",
+ "a8127adfdead454d8d840cf8853b870f",
+ "c2bc67eecbd94864903b4eb2b5cb576b",
+ "0e90e19e3337489995a25f15e05e5393",
+ "19f56057ee7642d1ac110a972c81c4cb",
+ "3dfe38aaefe848df9fcc117def07c9f1",
+ "f7249bbc1e9b4641aecb3796cce19dd5",
+ "8e1f5a6b8a21451e937ef590772aec68",
+ "927fb91a51214d5fa8df5191f5c9d396",
+ "353fead6d5494fe4a5d63a6d7ad3f14c",
+ "52d66e638fd743d5a8c8a0f84ff8e819",
+ "924d96d4d3524fd8a4ebffc4d1adde33",
+ "ce4b4ee3ae534ab59e58b62e582a74a5",
+ "7ee4fe49c59e439c95518943c4da73a4",
+ "042f95f828334688afa4386d61dfb752",
+ "013b29ba4baa45789963aded99aa72d7",
+ "f6de121d8ace4fdf89c1ced1bdf41047",
+ "8ba77dc2c6ae4babbb8a37e2b7ec0d87",
+ "3ff7083836c340928dda9d09a4fc2ac8"
+ ]
+ },
+ "id": "LkkX2R07g8DF",
+ "outputId": "d388644a-4ae8-4ed3-fe51-61c94b034713",
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "07df370aa2c243e285d2fc79a85c7fde",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "config.json: 0%| | 0.00/638 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9420b28ead574ec0818ad38d673a4a22",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model.safetensors.index.json: 0%| | 0.00/23.9k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0ab5311fbafb41fbb1a734b95765e00a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading shards: 0%| | 0/8 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a6e30cba8ec74f4a90c08a42477be348",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model-00001-of-00008.safetensors: 0%| | 0.00/1.89G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3917a2598c5c42bd9681a005338f34d7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model-00002-of-00008.safetensors: 0%| | 0.00/1.95G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "48b72b62f2284c3d81a76a223ad00b1a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model-00003-of-00008.safetensors: 0%| | 0.00/1.98G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fd30b1b0665e4029a67b79eff69bddad",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model-00004-of-00008.safetensors: 0%| | 0.00/1.95G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e52212e786544c91b390c358c8c0b1a9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model-00005-of-00008.safetensors: 0%| | 0.00/1.98G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c644623691084e0b881144652c0f31ec",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model-00006-of-00008.safetensors: 0%| | 0.00/1.95G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6ae56fbadf4d488c97bc8ce0b1439786",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model-00007-of-00008.safetensors: 0%| | 0.00/1.98G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1250961292b44114bd23911d87f65747",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model-00008-of-00008.safetensors: 0%| | 0.00/816M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a0051fa0a8584767bc36b0afd13b4386",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/8 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "927fb91a51214d5fa8df5191f5c9d396",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "generation_config.json: 0%| | 0.00/111 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "model = transformers.AutoModelForCausalLM.from_pretrained(\n",
+ " base_model_identifier, quantization_config=quantisation_configuration, device_map={\"\": 0}\n",
+ ")\n",
+ "model.config.use_cache = False\n",
+ "model.config.pretraining_tp = 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 177,
+ "referenced_widgets": [
+ "d6574b9913e449ae9cd7a072994367c0",
+ "02cb78b27b484ad6966182619e29fd54",
+ "97ddbbcc782e4af09c2c84e52b6f95fb",
+ "fbbc1565972c413cbc29543bdd8900d0",
+ "00dc4e0838b7405a9717ae1eaba4f742",
+ "a7164b6aed6a4543b06a30c9d6ff2723",
+ "9f25f3443c3245239d046250bf76317a",
+ "9a46cccba7004d5e991bffa680796f85",
+ "b4c678188f7c479a884597aa76ebc2c2",
+ "332be8ac435844a29dc48a66e37d7cfa",
+ "b7ba3c43942c442fb397422c67a43b4c",
+ "f4061b6242754cf38bf27971e41c3f4c",
+ "015725f756ac43bc892bc3a7046ca372",
+ "6ee0263bb70f42ca95e283d731708b94",
+ "79592715da7f4d5b9f766d32499bbf3a",
+ "e7156242d7b24330a01da48362d4d187",
+ "7613fa817cbb440fa29ca93f8ae47a90",
+ "1b11c05f35cc455eafd1aabfa86d46b1",
+ "48a9f43772fb44c690137928a5c8b0e3",
+ "25b72b95cf2745be9bfe8a047085df99",
+ "d89561c94ecf414d80a24f4aeba289cf",
+ "346a06cd9e1840ea8362143b697bc645",
+ "bcef83603618409a96baf42d7db14b3d",
+ "1b4b4871db8e45b5ba018ec41466ba28",
+ "92d95391d06c4bcc89d3517e1111dc28",
+ "d991ee4630e042bf8e03d8a1db3ea4b8",
+ "6571f6b5b0784f06bae68a31124e2a30",
+ "7c6de2b186944f1ea92237bdb3d13e16",
+ "971fda8bbd214dbcb23c047b2dda39e8",
+ "4c5d58753f584144baf64cf52de93add",
+ "5b7e083e4b8240859d3b0127fa42289d",
+ "6698637e8a794ebf897fe033221018ac",
+ "46474d2e15a44c588791b6a21479e6a5",
+ "fcb0306afa004784803ed769fda4df8c",
+ "ed3dbc72f4804fb2950f6649e5ea4b75",
+ "692ef916c3af458f8907fa28e6697796",
+ "053fa2533d3f4ba7ae6081470f18795b",
+ "33729e9ab63b4e6fa4cb9f537f06dede",
+ "3f3ddd1da98e44729947e1f17cccff75",
+ "f229c20327574a91bfe9fec859c128c0",
+ "73640b0a6d9d4cbcaba58bd91fdf5b80",
+ "ea1a245aab5a4d46adc7b55492899432",
+ "2dd8db9b09304fb891131df193c639ac",
+ "997364118d5c4b57af4509cbf2398d06",
+ "263044aacfbc452c9a185f356b899f88",
+ "b3a876a45a2e4ac893fc9f311294a886",
+ "a7bc6a0a0a4345cca62b0db408198d36",
+ "3b6758bbf1d04599b583e80c594a73ee",
+ "8a80afee56684a6082bded8c6fd0cabb",
+ "df663d20a7834c21823923d859cb26dc",
+ "c3dc9543ec424d57ab067e5bc9c8182b",
+ "3633a1ca2f0d4cd3a2df9985c8bc3caa",
+ "a7565a528e4e4ddaa321e67c28e531a1",
+ "5f7c82759e8649f48d008cce3ac7ff96",
+ "4fb7adf0ecb94787ad35b05149854c02"
+ ]
+ },
+ "id": "si_oPGOBg8Av",
+ "outputId": "99a3caac-1955-45f5-f9fe-699655c95f87",
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d6574b9913e449ae9cd7a072994367c0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer_config.json: 0%| | 0.00/1.43k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f4061b6242754cf38bf27971e41c3f4c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer.model: 0%| | 0.00/493k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "bcef83603618409a96baf42d7db14b3d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer.json: 0%| | 0.00/1.80M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fcb0306afa004784803ed769fda4df8c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "added_tokens.json: 0%| | 0.00/42.0 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "263044aacfbc452c9a185f356b899f88",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "special_tokens_map.json: 0%| | 0.00/168 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "tokeniser = transformers.AutoTokenizer.from_pretrained(base_model_identifier)\n",
+ "tokeniser.pad_token = tokeniser.eos_token\n",
+ "tokeniser.padding_side = \"right\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "A4qbH3khg7-j",
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "peft_configuration = peft.LoraConfig(\n",
+ " r=8, lora_alpha=16, lora_dropout=0.1, bias=\"none\", task_type=peft.TaskType.CAUSAL_LM\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "Gr9T2wyog78S",
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "peft_model = peft.get_peft_model(model, peft_configuration)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "id": "Pb12YV8pp2cr",
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "training_configuration = transformers.TrainingArguments(\n",
+ " output_dir=tuning_checkpoint_directory,\n",
+ " per_device_train_batch_size=4,\n",
+ " gradient_accumulation_steps=1,\n",
+ " learning_rate=1e-3,\n",
+ " weight_decay=0.001,\n",
+ " max_grad_norm=0.3,\n",
+ " num_train_epochs=5,\n",
+ " max_steps=-1,\n",
+ " lr_scheduler_type=transformers.SchedulerType.COSINE,\n",
+ " warmup_ratio=0.03,\n",
+ " save_strategy=\"epoch\",\n",
+ " save_safetensors=True,\n",
+ " use_cpu=False,\n",
+ " seed=0,\n",
+ " data_seed=0,\n",
+ " bf16=False,\n",
+ " fp16=True,\n",
+ " optim=\"paged_adamw_32bit\",\n",
+ " group_by_length=True,\n",
+ " report_to=\"none\",\n",
+ " auto_find_batch_size=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 104,
+ "referenced_widgets": [
+ "0200690821004c53b8a823d6a80be6d2",
+ "218e487e641845fab0f156731f3f27de",
+ "06282dd869984966b6b63596c0277f5f",
+ "537fc9864935444080a1471c3306ffa9",
+ "08142a29c33a424999e895e211d4cffe",
+ "dfe26c04317046f98e348be1a8a5a3f7",
+ "a0d0db553d28462e8b299f2871c28c12",
+ "6dec6a1d8ec44e978fc1ded152f68da9",
+ "51131685f20e4b5cab193e0661b82389",
+ "e17571f7aa6e4d1ab9424822bd7e3c2e",
+ "02cc17e2fc1546e8887b696b1155cc4e"
+ ]
+ },
+ "id": "lSJ0u9cHp2aL",
+ "outputId": "0e459867-d674-42a8-c49d-3f632e3a27be",
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py:194: UserWarning: You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to 1024\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0200690821004c53b8a823d6a80be6d2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Map: 0%| | 0/2854 [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "supervised_trainer = trl.SFTTrainer(\n",
+ " model=peft_model,\n",
+ " args=training_configuration,\n",
+ " train_dataset=hugging_face_dataset,\n",
+ " tokenizer=tokeniser,\n",
+ " dataset_text_field=\"instruction_without_context\",\n",
+ " packing=False,\n",
+ " max_seq_length=None,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 604
+ },
+ "id": "3o9wuWlmp2X7",
+ "outputId": "595532de-ef85-417c-bd64-5db6eef28052",
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
Step | \n", + "Training Loss | \n", + "
---|---|
500 | \n", + "0.985400 | \n", + "
1000 | \n", + "0.780300 | \n", + "
1500 | \n", + "0.822400 | \n", + "
2000 | \n", + "0.814500 | \n", + "
2500 | \n", + "0.760100 | \n", + "
3000 | \n", + "0.752200 | \n", + "
3500 | \n", + "0.582600 | \n", + "
4000 | \n", + "0.522300 | \n", + "
4500 | \n", + "0.429100 | \n", + "
5000 | \n", + "0.359600 | \n", + "
5500 | \n", + "0.314100 | \n", + "
6000 | \n", + "0.251300 | \n", + "
6500 | \n", + "0.242300 | \n", + "
7000 | \n", + "0.222400 | \n", + "
"
+ ],
+ "text/plain": [
+ "