diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 17ec9443..232ea364 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,6 +25,7 @@ jobs: python -m pip install --upgrade pip pip install . pip install .[postgres] + pip install .[bigquery] pip install pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with pytest diff --git a/notebooks/vn-connect-to-bigquery.ipynb b/notebooks/vn-connect-to-bigquery.ipynb new file mode 100644 index 00000000..5ca96996 --- /dev/null +++ b/notebooks/vn-connect-to-bigquery.ipynb @@ -0,0 +1,532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Vanna AI](https://img.vanna.ai/vanna-ask.svg)\n", + "\n", + "The following notebook goes through the process of connecting to your Google Cloud project using the BigQuery connector and running SQL queries using Vanna AI. For demo purposes we are using one of Google's example queries.\n", + "\n", + "# Install Vanna\n", + "First we install Vanna from [PyPI](https://pypi.org/project/vanna/) and import it.\n", + "Here, we'll install vanna with the bigquery extra. If you're using a different database, you'll need to install the appropriate extras." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install vanna[bigquery]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import vanna as vn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Database Connection\n", + "These details are only referenced within your notebook. These database credentials are never sent to Vanna's servers.\n", + "You need to set:\n", + "`PROJECT_ID`.\n", + "in your environment. By default vanna will look for pre-set Google Application Default Credentials (ADC), but if they are not set, you'll also need to provide the path to a service account credentials JSON file. 
You can also pass the `cred_file_path` and `project_id` parameters to the method." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not using Google Colab.\n" + ] + } + ], + "source": [ + "vn.connect_to_bigquery()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or, if you are using a service account, you can provide the path to its credentials JSON file:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "cred_file_path = \"provide/creds/path.json\"\n", + "vn.connect_to_bigquery(cred_file_path=cred_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Note: \n", + "You need to add `PROJECT_ID` to your environment, or you can pass it to the above method as the `project_id` parameter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Get Results\n", + "This takes the SQL, runs it, and returns its result as a dataframe. Note that we use your provided credentials to execute the SQL on your warehouse from your local instance. Neither your connection nor your data is sent to Vanna's servers. For more info on how Vanna works, [see this post](https://medium.com/vanna-ai/how-vanna-works-how-to-train-it-data-security-8d8f2008042). We will be using Google's demo SQL below. Note that the SQL below only works for the Google demo data." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
submitter_idcase_iddiag__treat__countprimary_sitedisease_typeproj__nameproj__project_iddemo__demographic_iddemo__genderdemo__race...exp__bmiexp__years_smokedexp__pack_years_smokedexp__cigarettes_per_dayexp__alcohol_historyexp__stateexp__created_datetimeexp__updated_datetimestateupdated_datetime
0TCGA-CN-5363291b069c-9dde-4e1e-8430-85146bc943382LarynxSquamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSC2611cb61-6d05-5286-b94a-ce6cac2ba37bmaleblack or african american...NaNNaN15.00.821918YesreleasedNone2019-07-31T18:43:25.167078-05:00released2019-08-06T14:25:25.511101-05:00
1TCGA-CN-53654cffea0b-90a7-4c86-a73f-bb8feca3ada72TonsilSquamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSC97a7f69b-0f40-5450-bbeb-92084a100a9dmalewhite...NaNNaN26.01.424658YesreleasedNone2019-07-31T19:39:51.442671-05:00released2019-08-06T14:25:25.511101-05:00
2TCGA-CN-A642a1ded1e8-eb28-49dd-8f3d-1ce8f40eed8f2Other and unspecified parts of tongueSquamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSC4bc58619-47fc-5c2d-aaec-9d9e562e049bmalewhite...NaNNaN5.00.273973YesreleasedNone2019-07-31T19:30:27.901248-05:00released2019-08-06T14:25:39.854271-05:00
3TCGA-CR-738053b254b7-021f-43df-af9b-3fc01b87479e2Other and ill-defined sites in lip, oral cavit...Squamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSCbe41a712-ebee-52e1-907c-80b1917daa45malewhite...NaNNaNNaNNaNYesreleasedNone2019-07-31T19:40:20.032260-05:00released2019-08-06T14:26:05.315718-05:00
4TCGA-CV-5978e16e9535-b20f-4c9a-8b5b-82df80c994482LarynxSquamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSC92d1d967-c8a0-52cb-a62d-1d11bdf85068femaleblack or african american...NaNNaNNaNNaNYesreleasedNone2019-07-31T19:52:06.976359-05:00released2019-08-06T14:26:05.315718-05:00
5TCGA-CV-6948fcf0dc48-b889-4593-a15b-aa715aae7bf52Floor of mouthSquamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSC2fd1a926-7584-50d5-b6b7-9b9d02710f47femalewhite...NaNNaNNaNNaNNoreleasedNone2019-07-31T19:55:16.152855-05:00released2019-08-06T14:26:16.536997-05:00
6TCGA-CV-7409acd98e20-d2da-4256-99a5-13e261bc88e62Other and ill-defined sites in lip, oral cavit...Squamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSC2a3f5bb4-3606-5549-8d85-ec413eadd7abmaleblack or african american...NaNNaNNaNNaNNoreleasedNone2019-07-31T19:48:25.311492-05:00released2019-08-06T14:26:28.608672-05:00
7TCGA-CV-A6JUb1b3983d-37d2-4bef-bd17-708e3e6001462Other and unspecified parts of tongueSquamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSC604e3dac-30be-589d-b622-df0b41cd9a7ffemalewhite...NaNNaN81.04.438356YesreleasedNone2019-07-31T19:48:40.594893-05:00released2019-08-06T14:26:39.780396-05:00
8TCGA-QK-A6IHc1f286f6-d4a1-494a-88c8-ff8e2a3df2ce2GumSquamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSC83e5c705-bd2e-5516-9700-ed3803dde268femalewhite...NaNNaNNaNNaNYesreleasedNone2019-07-31T19:49:42.057478-05:00released2019-08-06T14:27:02.392779-05:00
9TCGA-QK-A8Z8ac511727-185b-4ac0-b6c0-dc3a79657be62LarynxSquamous Cell NeoplasmsHead and Neck Squamous Cell CarcinomaTCGA-HNSCfd1e46fb-43bb-54ae-b713-a579ba857ed4femaleblack or african american...NaNNaN80.04.383562YesreleasedNone2019-07-31T19:48:22.125112-05:00released2019-08-06T14:27:02.392779-05:00
\n", + "

10 rows × 70 columns

\n", + "
" + ], + "text/plain": [ + " submitter_id case_id diag__treat__count \\\n", + "0 TCGA-CN-5363 291b069c-9dde-4e1e-8430-85146bc94338 2 \n", + "1 TCGA-CN-5365 4cffea0b-90a7-4c86-a73f-bb8feca3ada7 2 \n", + "2 TCGA-CN-A642 a1ded1e8-eb28-49dd-8f3d-1ce8f40eed8f 2 \n", + "3 TCGA-CR-7380 53b254b7-021f-43df-af9b-3fc01b87479e 2 \n", + "4 TCGA-CV-5978 e16e9535-b20f-4c9a-8b5b-82df80c99448 2 \n", + "5 TCGA-CV-6948 fcf0dc48-b889-4593-a15b-aa715aae7bf5 2 \n", + "6 TCGA-CV-7409 acd98e20-d2da-4256-99a5-13e261bc88e6 2 \n", + "7 TCGA-CV-A6JU b1b3983d-37d2-4bef-bd17-708e3e600146 2 \n", + "8 TCGA-QK-A6IH c1f286f6-d4a1-494a-88c8-ff8e2a3df2ce 2 \n", + "9 TCGA-QK-A8Z8 ac511727-185b-4ac0-b6c0-dc3a79657be6 2 \n", + "\n", + " primary_site disease_type \\\n", + "0 Larynx Squamous Cell Neoplasms \n", + "1 Tonsil Squamous Cell Neoplasms \n", + "2 Other and unspecified parts of tongue Squamous Cell Neoplasms \n", + "3 Other and ill-defined sites in lip, oral cavit... Squamous Cell Neoplasms \n", + "4 Larynx Squamous Cell Neoplasms \n", + "5 Floor of mouth Squamous Cell Neoplasms \n", + "6 Other and ill-defined sites in lip, oral cavit... 
Squamous Cell Neoplasms \n", + "7 Other and unspecified parts of tongue Squamous Cell Neoplasms \n", + "8 Gum Squamous Cell Neoplasms \n", + "9 Larynx Squamous Cell Neoplasms \n", + "\n", + " proj__name proj__project_id \\\n", + "0 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "1 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "2 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "3 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "4 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "5 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "6 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "7 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "8 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "9 Head and Neck Squamous Cell Carcinoma TCGA-HNSC \n", + "\n", + " demo__demographic_id demo__gender \\\n", + "0 2611cb61-6d05-5286-b94a-ce6cac2ba37b male \n", + "1 97a7f69b-0f40-5450-bbeb-92084a100a9d male \n", + "2 4bc58619-47fc-5c2d-aaec-9d9e562e049b male \n", + "3 be41a712-ebee-52e1-907c-80b1917daa45 male \n", + "4 92d1d967-c8a0-52cb-a62d-1d11bdf85068 female \n", + "5 2fd1a926-7584-50d5-b6b7-9b9d02710f47 female \n", + "6 2a3f5bb4-3606-5549-8d85-ec413eadd7ab male \n", + "7 604e3dac-30be-589d-b622-df0b41cd9a7f female \n", + "8 83e5c705-bd2e-5516-9700-ed3803dde268 female \n", + "9 fd1e46fb-43bb-54ae-b713-a579ba857ed4 female \n", + "\n", + " demo__race ... exp__bmi exp__years_smoked \\\n", + "0 black or african american ... NaN NaN \n", + "1 white ... NaN NaN \n", + "2 white ... NaN NaN \n", + "3 white ... NaN NaN \n", + "4 black or african american ... NaN NaN \n", + "5 white ... NaN NaN \n", + "6 black or african american ... NaN NaN \n", + "7 white ... NaN NaN \n", + "8 white ... NaN NaN \n", + "9 black or african american ... 
NaN NaN \n", + "\n", + " exp__pack_years_smoked exp__cigarettes_per_day exp__alcohol_history \\\n", + "0 15.0 0.821918 Yes \n", + "1 26.0 1.424658 Yes \n", + "2 5.0 0.273973 Yes \n", + "3 NaN NaN Yes \n", + "4 NaN NaN Yes \n", + "5 NaN NaN No \n", + "6 NaN NaN No \n", + "7 81.0 4.438356 Yes \n", + "8 NaN NaN Yes \n", + "9 80.0 4.383562 Yes \n", + "\n", + " exp__state exp__created_datetime exp__updated_datetime \\\n", + "0 released None 2019-07-31T18:43:25.167078-05:00 \n", + "1 released None 2019-07-31T19:39:51.442671-05:00 \n", + "2 released None 2019-07-31T19:30:27.901248-05:00 \n", + "3 released None 2019-07-31T19:40:20.032260-05:00 \n", + "4 released None 2019-07-31T19:52:06.976359-05:00 \n", + "5 released None 2019-07-31T19:55:16.152855-05:00 \n", + "6 released None 2019-07-31T19:48:25.311492-05:00 \n", + "7 released None 2019-07-31T19:48:40.594893-05:00 \n", + "8 released None 2019-07-31T19:49:42.057478-05:00 \n", + "9 released None 2019-07-31T19:48:22.125112-05:00 \n", + "\n", + " state updated_datetime \n", + "0 released 2019-08-06T14:25:25.511101-05:00 \n", + "1 released 2019-08-06T14:25:25.511101-05:00 \n", + "2 released 2019-08-06T14:25:39.854271-05:00 \n", + "3 released 2019-08-06T14:26:05.315718-05:00 \n", + "4 released 2019-08-06T14:26:05.315718-05:00 \n", + "5 released 2019-08-06T14:26:16.536997-05:00 \n", + "6 released 2019-08-06T14:26:28.608672-05:00 \n", + "7 released 2019-08-06T14:26:39.780396-05:00 \n", + "8 released 2019-08-06T14:27:02.392779-05:00 \n", + "9 released 2019-08-06T14:27:02.392779-05:00 \n", + "\n", + "[10 rows x 70 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql=\"\"\"SELECT *\n", + "FROM `isb-cgc-bq.TCGA_versioned.clinical_gdc_r24`\n", + "LIMIT 10\"\"\"\n", + "vn.run_sql(sql=sql)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run as a Web App\n", + "If you would like to use this functionality in a web app, you can deploy 
def connect_to_bigquery(cred_file_path: str = None, project_id: str = None):
    """
    Connect to Google BigQuery. This is just a helper function to set [`vn.run_sql`][vanna.run_sql]

    Implicit Google credentials are tried first (the interactive Colab login
    when running inside Google Colab, otherwise whatever default credentials
    the BigQuery client can discover). The service account file is only used
    to build the client when no implicit credentials are available, but it is
    always validated when a path is given.

    **Example:**
    ```python
    import vanna as vn

    vn.connect_to_bigquery(
        project_id="myprojectid",
        cred_file_path="path/to/credentials.json",
    )
    ```

    Args:
        cred_file_path (str): Path to a service account credentials JSON file.
            Optional when implicit credentials are available.
        project_id (str): The Google Cloud project id. Falls back to the
            `PROJECT_ID` environment variable when not given.

    Raises:
        DependencyError: The BigQuery client libraries are not installed.
        ImproperlyConfigured: No project id was supplied, Colab authentication
            failed, the credentials file did not pass `validate_config_path`,
            or a client could not be created from the service account.
        ValidationError: Neither implicit credentials nor a credentials file
            are available. Also raised by the installed `run_sql` when
            BigQuery rejects a query.
    """

    try:
        from google.api_core.exceptions import GoogleAPIError
        from google.cloud import bigquery
        from google.oauth2 import service_account
    except ImportError as exc:
        raise DependencyError("You need to install required dependencies to execute this method, run command:"
                              " \npip install vanna[bigquery]") from exc

    if not project_id:
        project_id = os.getenv('PROJECT_ID')

    if not project_id:
        raise ImproperlyConfigured("Please set your Google Cloud Project ID.")

    import sys
    if "google.colab" in sys.modules:
        try:
            from google.colab import auth
            auth.authenticate_user()
        except Exception as e:
            raise ImproperlyConfigured(e) from e
    else:
        print("Not using Google Colab.")

    conn = None

    try:
        # Pass the project explicitly so the validated project id is the one
        # queries are billed to, not whatever the environment defaults to.
        conn = bigquery.Client(project=project_id)
    except Exception:
        # Best-effort probe: any failure here just means we fall back to the
        # service account file below. (Not a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.)
        print("Could not find any google cloud implicit credentials")

    if cred_file_path:
        # Validate file path and permissions
        validate_config_path(cred_file_path)
    elif conn is None:
        raise ValidationError("Please provide a service account credentials json file")

    if conn is None:
        with open(cred_file_path, 'r') as f:
            credentials = service_account.Credentials.from_service_account_info(
                json.load(f),
                scopes=["https://www.googleapis.com/auth/cloud-platform"]
            )

        try:
            conn = bigquery.Client(project=project_id, credentials=credentials)
        except Exception as exc:
            raise ImproperlyConfigured("Could not connect to bigquery please correct credentials") from exc

    def run_sql_bigquery(sql: str) -> Union[pd.DataFrame, None]:
        """Run `sql` on the connected BigQuery client and return the rows as a DataFrame."""
        try:
            return conn.query(sql).result().to_dataframe()
        except GoogleAPIError as error:
            # Only some GoogleAPIError subclasses carry a structured ``errors``
            # list, so fall back to the exception text when it is missing or empty.
            details = getattr(error, "errors", None) or []
            messages = [
                str(detail.get("message", detail)) if isinstance(detail, dict) else str(detail)
                for detail in details
            ]
            # Raise a real exception: raising the bare list is itself a TypeError
            # and hides the actual BigQuery message from the caller.
            raise ValidationError("; ".join(messages) or str(error)) from error

    global run_sql
    run_sql = run_sql_bigquery
class Client:
    """Minimal stand-in for ``google.cloud.bigquery.Client`` used by the tests below."""

    def __init__(self, *args, **kwargs):
        # Accept and ignore whatever the real client is constructed with
        # (e.g. ``project=`` / ``credentials=``) so the stub never fails on
        # construction.
        pass

    def query(self, query):
        pass


@contextlib.contextmanager
def create_file(file_path):
    """Create an empty, write-only file at `file_path` and remove it on exit."""
    with open(file_path, "w"):
        pass
    # Owner-write only: drops the read bit so the file should look unreadable.
    os.chmod(file_path, stat.S_IWUSR)
    try:
        yield file_path
    finally:
        os.remove(file_path)


@pytest.mark.parametrize("params", [
    dict(project_id=None),
])
def test_connect_to_bigquery_validations(monkeypatch, params):
    """A missing project id (argument and environment) is rejected."""
    monkeypatch.setattr("google.cloud.bigquery.Client", Client)
    # Make the test independent of the developer's / CI environment.
    monkeypatch.delenv("PROJECT_ID", raising=False)
    with pytest.raises(ImproperlyConfigured) as exc:
        vn.connect_to_bigquery(**params)
    # ``exc`` is an ExceptionInfo; the raised exception lives on ``exc.value``
    # and is only populated once the ``with`` block has exited.
    assert "Please set your Google Cloud Project ID." in str(exc.value)


@pytest.mark.parametrize("params, expected_err", [
    (
        dict(
            project_id="test-project",
            cred_file_path="wrong/file/path.json"
        ),
        "No such configuration file: wrong/file/path.json"
    ),
    (
        dict(
            project_id="test-project",
            cred_file_path="tests"
        ),
        "Config should be a file: tests"
    )
])
def test_connect_to_bigquery_creds_path_validations(monkeypatch, params, expected_err):
    """A credentials path that is missing or is not a file is rejected."""
    monkeypatch.setattr("google.cloud.bigquery.Client", Client)
    with pytest.raises(ImproperlyConfigured) as exc:
        vn.connect_to_bigquery(**params)
    assert expected_err in str(exc.value)


@pytest.mark.parametrize("params", [
    dict(
        project_id="test-project",
        cred_file_path="tests/test-creds.json"
    ),
])
def test_connect_to_bigquery_creds_file_permissions(monkeypatch, params):
    """A credentials file without read permission is rejected."""
    monkeypatch.setattr("google.cloud.bigquery.Client", Client)
    with create_file(params["cred_file_path"]) as creds_path:
        if os.access(creds_path, os.R_OK):
            # Mode bits are not enforced for every user/platform (e.g. root);
            # in that case the scenario under test cannot be reproduced.
            pytest.skip("Cannot make a file unreadable for the current user/platform")
        with pytest.raises(ImproperlyConfigured) as exc:
            vn.connect_to_bigquery(**params)
        assert f"Cannot read the config file. Please grant read privileges: {creds_path}" in str(exc.value)