From 4c767cd82a4e62aee7f66b0e28b5638700bb64b9 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 15 May 2024 14:00:30 -0300 Subject: [PATCH] updated youtube transcript video package --- package.json | 6 +- pnpm-lock.yaml | 245 ++++++++++++++++-- src/__tests__/providers/YouTube/index.test.ts | 2 +- src/providers/WebScraper/single_url.ts | 4 +- src/providers/YouTube/index.ts | 7 +- 5 files changed, 240 insertions(+), 24 deletions(-) diff --git a/package.json b/package.json index 8f7f141..0654dbd 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/data-connectors", - "version": "0.0.49-beta.6", + "version": "0.0.50", "description": "Data connectors for LLMs. Made by Mendable.ai", "main": "dist/index.js", "module": "dist/index.mjs", @@ -46,6 +46,7 @@ "@babel/preset-typescript": "^7.23.3", "@types/dotenv": "^8.2.0", "@types/fluent-ffmpeg": "^2.1.24", + "@types/he": "^1.2.3", "@types/jest": "^29.5.11", "@types/pdf-parse": "^1.1.4", "@types/xml2js": "^0.4.14", @@ -74,6 +75,7 @@ "form-data": "^4.0.0", "glob": "^10.3.10", "googleapis": "^131.0.0", + "he": "^1.2.0", "jira.js": "^3.0.2", "mammoth": "^1.6.0", "node-html-parser": "^6.1.12", @@ -87,7 +89,7 @@ "turndown": "^7.1.3", "xlsx": "^0.18.5", "xml2js": "^0.6.2", - "youtube-transcript": "^1.0.6" + "youtube-transcript": "^1.2.1" }, "nodemonConfig": { "ignore": [ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f4a6871..40afd59 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -44,6 +44,9 @@ dependencies: googleapis: specifier: ^131.0.0 version: 131.0.0 + he: + specifier: ^1.2.0 + version: 1.2.0 jira.js: specifier: ^3.0.2 version: 3.0.2 @@ -56,6 +59,9 @@ dependencies: octokit: specifier: ^3.1.2 version: 3.1.2 + onedrive-api: + specifier: ^1.1.1 + version: 1.1.1 openai: specifier: ^4.13.0 version: 4.28.4 @@ -74,12 +80,15 @@ dependencies: turndown: specifier: ^7.1.3 version: 7.1.3 + xlsx: + specifier: ^0.18.5 + version: 0.18.5 xml2js: specifier: ^0.6.2 version: 0.6.2 youtube-transcript: - specifier: ^1.0.6 - version: 1.0.6 + specifier: ^1.2.1 + version: 1.2.1 devDependencies: '@babel/preset-env': @@ -94,6 +103,9 @@ devDependencies: '@types/fluent-ffmpeg': specifier: ^2.1.24 version: 2.1.24 + '@types/he': + specifier: ^1.2.3 + version: 1.2.3 '@types/jest': specifier: ^29.5.11 version: 29.5.11 @@ -112,6 +124,9 @@ devDependencies: babel-jest: specifier: ^29.7.0 version: 29.7.0(@babel/core@7.23.9) + cross-env: + specifier: ^7.0.3 + version: 7.0.3 eslint: specifier: 8.35.0 version: 8.35.0 @@ -1966,6 +1981,10 @@ packages: '@jridgewell/resolve-uri': 3.1.1 '@jridgewell/sourcemap-codec': 1.4.15 + /@microsoft/microsoft-graph-types@2.40.0: + resolution: {integrity: sha512-1fcPVrB/NkbNcGNfCy+Cgnvwxt6/sbIEEFgZHFBJ670zYLegENYJF8qMo7x3LqBjWX2/Eneq5BVVRCLTmlJN+g==} + dev: false + /@nangohq/node@0.36.100: resolution: {integrity: sha512-Zap2JYdG+EIz0JC0ogkjnFyAX3kgfHhXd+b4gDq0FoAepjRYaFO8MVcWnsfYrQIN/+B7dLbPKXz6HVXCchag9Q==} engines: {node: '>=16.7'} @@ -2370,6 +2389,11 @@ packages: resolution: {integrity: sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA==} dev: true + /@sindresorhus/is@5.6.0: + resolution: {integrity: sha512-TV7t8GKYaJWsn00tFDqBw8+Uqmr8A0fRU1tvTQhyZzGv0sJCGRQL3JGMI3ucuKo3XIZdUP+Lx7/gh2t3lewy7g==} + engines: {node: '>=14.16'} + dev: false + /@sinonjs/commons@3.0.1: resolution: {integrity: sha512-K3mCHKQ9sVh8o1C9cxkwxaOmXoAMlDxC1mYyHrjqOWEcBjYr76t96zL2zlj5dUGZ3HSw240X1qgH3Mjf1yJWpQ==} dependencies: @@ -2382,6 +2406,13 @@ packages: '@sinonjs/commons': 3.0.1 dev: true + /@szmarczak/http-timer@5.0.1: + resolution: {integrity: sha512-+PmQX0PiAYPMeVYe237LJAYvOMYW1j2rH5YROyS3b4CTVJum34HfRvKvAzozHAQG0TnHNdUfY9nCeUyRAs//cw==} + engines: {node: '>=14.16'} + dependencies: + defer-to-connect: 2.0.1 + dev: false + /@tootallnate/quickjs-emscripten@0.23.0: resolution: {integrity: sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==} dev: false @@ -2445,6 +2476,14 @@ packages: '@types/node': 20.11.7 dev: true + /@types/he@1.2.3: + resolution: {integrity: sha512-q67/qwlxblDzEDvzHhVkwc1gzVWxaNxeyHUBF4xElrvjL11O+Ytze+1fGpBHlr/H9myiBUaUXNnNPmBHxxfAcA==} + dev: true + + /@types/http-cache-semantics@4.0.4: + resolution: {integrity: sha512-1m0bIFVc7eJWyve9S0RnuRgcQqF/Xd5QsUZAZeQFr1Q3/p9JWoQQEqmVy+DPTNpGXwhgIetAoYF8JSc33q29QA==} + dev: false + /@types/istanbul-lib-coverage@2.0.6: resolution: {integrity: sha512-2QF/t/auWm0lsy8XtKVPG19v3sSOQlJe/YHZgfjb/KBBHOGSV+J2q/S671rcq9uTBrLAXmZpqJiaQbMT+zNU1w==} dev: true @@ -2751,6 +2790,11 @@ packages: engines: {node: '>=0.4.0'} dev: true + /adler-32@1.3.1: + resolution: {integrity: sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==} + engines: {node: '>=0.8'} + dev: false + /agent-base@7.1.0: resolution: {integrity: sha512-o/zjMZRhJxny7OyEF+Op8X+efiELC7k7yOjMzgfzVqOzXqkBkWI79YoTdOtsuWd5BWhAGAuOY/Xa6xpiaWXiNg==} engines: {node: '>= 14'} @@ -3117,6 +3161,24 @@ packages: engines: {node: '>=8'} dev: false + /cacheable-lookup@7.0.0: + resolution: {integrity: sha512-+qJyx4xiKra8mZrcwhjMRMUhD5NR1R8esPkzIYxX96JiecFoxAXFuz/GpR3+ev4PE1WamHip78wV0vcmPQtp8w==} + engines: {node: '>=14.16'} + dev: false + + /cacheable-request@10.2.14: + resolution: {integrity: sha512-zkDT5WAF4hSSoUgyfg5tFIxz8XQK+25W/TLVojJTMKBaxevLBBtLxgqguAuVQB8PVW79FVjHcU+GJ9tVbDZ9mQ==} + engines: {node: '>=14.16'} + dependencies: + '@types/http-cache-semantics': 4.0.4 + get-stream: 6.0.1 + http-cache-semantics: 4.1.1 + keyv: 4.5.4 + mimic-response: 4.0.0 + normalize-url: 8.0.1 + responselike: 3.0.0 + dev: false + /call-bind@1.0.5: resolution: {integrity: sha512-C3nQxfFZxFRVoJoGKKI8y3MOEo129NQ+FgQ08iye+Mk4zNZZGdjfs06bVTr+DBSlA66Q2VEcMki/cUCP4SercQ==} dependencies: @@ -3142,8 +3204,12 @@ packages: resolution: {integrity: sha512-mtj5ur2FFPZcCEpXFy8ADXbDACuNFXg6mxVDqp7tqooX6l3zwm+d8EPoeOSIFRDvHs8qu7/SLFOGniULkcH2iA==} dev: true - /centra@2.6.0: - resolution: {integrity: sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ==} + /cfb@1.2.2: + resolution: {integrity: sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==} + engines: {node: '>=0.8'} + dependencies: + adler-32: 1.3.1 + crc-32: 1.2.2 dev: false /chalk@2.4.2: @@ -3247,6 +3313,11 @@ packages: engines: {iojs: '>= 1.0.0', node: '>= 0.12.0'} dev: true + /codepage@1.15.0: + resolution: {integrity: sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==} + engines: {node: '>=0.8'} + dev: false + /collect-v8-coverage@1.0.2: resolution: {integrity: sha512-lHl4d5/ONEbLlJvaJNtsF/Lz+WvB07u2ycqTYbdrq7UypDXailES4valYb2eWiJFxZlVmpGekfqoxQhzyFdT4Q==} dev: true @@ -3326,6 +3397,12 @@ packages: typescript: 4.9.5 dev: false + /crc-32@1.2.2: + resolution: {integrity: sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==} + engines: {node: '>=0.8'} + hasBin: true + dev: false + /create-jest@29.7.0: resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} @@ -3344,6 +3421,14 @@ packages: - ts-node dev: true + /cross-env@7.0.3: + resolution: {integrity: sha512-+/HKd6EgcQCJGh2PSjZuUitQBQynKor4wrFbRg4DtAgS1aWO+gU52xpH7M9ScGgXSYmAVS9bIJ8EzuaGw0oNAw==} + engines: {node: '>=10.14', npm: '>=6', yarn: '>=1'} + hasBin: true + dependencies: + cross-spawn: 7.0.3 + dev: true + /cross-fetch@4.0.0: resolution: {integrity: sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==} dependencies: @@ -3411,6 +3496,13 @@ packages: engines: {node: '>=10'} dev: false + /decompress-response@6.0.0: + resolution: {integrity: sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==} + engines: {node: '>=10'} + dependencies: + mimic-response: 3.1.0 + dev: false + /dedent@1.5.1: resolution: {integrity: sha512-+LxW+KLWxu3HW3M2w2ympwtqPrqYRzU8fqi6Fhd18fBALe15blJPI/I4+UHveMVG6lJqB4JNd4UG0S5cnVHwIg==} peerDependencies: @@ -3429,6 +3521,11 @@ packages: engines: {node: '>=0.10.0'} dev: true + /defer-to-connect@2.0.1: + resolution: {integrity: sha512-4tvttepXG1VaYGrRibk5EwJd1t4udunSOVMdLSAL6mId1ix438oPwPZMALY41FCijukO1L0twNcGsdzS7dHgDg==} + engines: {node: '>=10'} + dev: false + /define-data-property@1.1.1: resolution: {integrity: sha512-E7uGkTzkk1d0ByLeSc6ZsFS79Axg+m1P/VsgYsxHgiuc3tFSj+MjMIwe90FC4lOAZzNBdY7kkO2P2wKdsQ1vgQ==} engines: {node: '>= 0.4'} @@ -3960,6 +4057,11 @@ packages: resolution: {integrity: sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==} dev: false + /form-data-encoder@2.1.4: + resolution: {integrity: sha512-yDYSgNMraqvnxiEXO4hi88+YZxaHC6QKzb5N84iRCTDeRO7ZALpir/lVmf/uXUhnwUr2O4HU8s/n6x+yNjQkHw==} + engines: {node: '>= 14.17'} + dev: false + /form-data@4.0.0: resolution: {integrity: sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==} engines: {node: '>= 6'} @@ -3977,6 +4079,11 @@ packages: web-streams-polyfill: 4.0.0-beta.3 dev: false + /frac@1.1.2: + resolution: {integrity: sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==} + engines: {node: '>=0.8'} + dev: false + /fs-extra@8.1.0: resolution: {integrity: sha512-yhlQgA6mnOJUKOsRUFsgJdQCvkKhcz8tlZG5HBQfReYZy46OwLcY+Zia0mtdHsOo9y/hP+CxMN0TU9QxoOtG4g==} engines: {node: '>=6 <7 || >=8'} @@ -4176,6 +4283,23 @@ packages: get-intrinsic: 1.2.2 dev: false + /got@12.6.0: + resolution: {integrity: sha512-WTcaQ963xV97MN3x0/CbAriXFZcXCfgxVp91I+Ze6pawQOa7SgzwSx2zIJJsX+kTajMnVs0xcFD1TxZKFqhdnQ==} + engines: {node: '>=14.16'} + dependencies: + '@sindresorhus/is': 5.6.0 + '@szmarczak/http-timer': 5.0.1 + cacheable-lookup: 7.0.0 + cacheable-request: 10.2.14 + decompress-response: 6.0.0 + form-data-encoder: 2.1.4 + get-stream: 6.0.1 + http2-wrapper: 2.2.1 + lowercase-keys: 3.0.0 + p-cancelable: 3.0.0 + responselike: 3.0.0 + dev: false + /graceful-fs@4.2.11: resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} @@ -4231,6 +4355,7 @@ packages: /he@1.2.0: resolution: {integrity: sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==} + hasBin: true dev: false /html-escaper@2.0.2: @@ -4246,6 +4371,10 @@ packages: entities: 4.5.0 dev: false + /http-cache-semantics@4.1.1: + resolution: {integrity: sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ==} + dev: false + /http-proxy-agent@7.0.0: resolution: {integrity: sha512-+ZT+iBxVUQ1asugqnD6oWoRiS25AkjNfG085dKJGtGxkdwLQrMKU5wJr2bOOFAXzKcTuqq+7fZlTMgG3SRfIYQ==} engines: {node: '>= 14'} @@ -4256,6 +4385,14 @@ packages: - supports-color dev: false + /http2-wrapper@2.2.1: + resolution: {integrity: sha512-V5nVw1PAOgfI3Lmeaj2Exmeg7fenjhRUgz1lPSezy1CuhPYbgQtbQj4jZfEAEMlaL+vupsvhjqCyjzob0yxsmQ==} + engines: {node: '>=10.19.0'} + dependencies: + quick-lru: 5.1.1 + resolve-alpn: 1.2.1 + dev: false + /https-proxy-agent@7.0.2: resolution: {integrity: sha512-NmLNjm6ucYwtcUmL7JQC1ZQ57LmHP4lT15FQ8D61nak1rO6DH+fz5qNK2Ap5UN4ZapYICE3/0KodcLYSPsPbaA==} engines: {node: '>= 14'} @@ -4351,6 +4488,10 @@ packages: hasown: 2.0.0 dev: true + /is-empty-object@1.1.1: + resolution: {integrity: sha512-Ezcg4AgwFtRkqP4T1mgR04VCM30ruQKHqYpNPof2TUJTxfcQcAvlUS2j1tDu0VBKGvGnLHrSPhwIugKrw9bR9A==} + dev: false + /is-extglob@2.1.1: resolution: {integrity: sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==} engines: {node: '>=0.10.0'} @@ -4929,7 +5070,6 @@ packages: /json-buffer@3.0.1: resolution: {integrity: sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==} - dev: true /json-parse-even-better-errors@2.3.1: resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} @@ -5016,7 +5156,6 @@ packages: resolution: {integrity: sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==} dependencies: json-buffer: 3.0.1 - dev: true /kleur@3.0.3: resolution: {integrity: sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==} @@ -5125,6 +5264,11 @@ packages: underscore: 1.13.6 dev: false + /lowercase-keys@3.0.0: + resolution: {integrity: sha512-ozCC6gdQ+glXOQsveKD0YsDy8DSQFjDTz4zyzEHNV5+JP5D62LmfDZ6o1cycFx9ouG940M5dE8C8CTewdj2YWQ==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + dev: false + /lru-cache@10.2.0: resolution: {integrity: sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==} engines: {node: 14 || >=16.14} @@ -5219,6 +5363,16 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} + /mimic-response@3.1.0: + resolution: {integrity: sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==} + engines: {node: '>=10'} + dev: false + + /mimic-response@4.0.0: + resolution: {integrity: sha512-e5ISH9xMYU0DzrT+jl8q2ze9D6eWBto+I8CNpe+VI+K2J/F/k3PdkdTdz4wvGVH4NTpo+NRYTVIuMQEMMcsLqg==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + dev: false + /minimatch@3.1.2: resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} dependencies: @@ -5304,6 +5458,11 @@ packages: resolution: {integrity: sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==} engines: {node: '>=0.10.0'} + /normalize-url@8.0.1: + resolution: {integrity: sha512-IO9QvjUMWxPQQhs60oOu10CRkWCiZzSUkzbXGGV9pviYl1fXYcvkzQ5jV9z8Y6un8ARoVRl4EtC6v6jNqbaJ/w==} + engines: {node: '>=14.16'} + dev: false + /npm-run-path@4.0.1: resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} engines: {node: '>=8'} @@ -5350,6 +5509,14 @@ packages: dependencies: wrappy: 1.0.2 + /onedrive-api@1.1.1: + resolution: {integrity: sha512-6rtox7rnxG7xVApz5TSFPnSGM6A9s/s5PUIaGiBdpapAyJogdIVmP6LqN5ltp3L2CeotTIXqUUjOTxHUmSRkfg==} + dependencies: + '@microsoft/microsoft-graph-types': 2.40.0 + got: 12.6.0 + is-empty-object: 1.1.1 + dev: false + /onetime@5.1.2: resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==} engines: {node: '>=6'} @@ -5389,6 +5556,11 @@ packages: type-check: 0.4.0 dev: true + /p-cancelable@3.0.0: + resolution: {integrity: sha512-mlVgR3PGuzlo0MmTdk4cXqXWlwQDLnONTAg6sm62XkMJEiRxN3GL3SffkYvqwonbkJBcrI7Uvv5Zh9yjvn2iUw==} + engines: {node: '>=12.20'} + dev: false + /p-limit@2.3.0: resolution: {integrity: sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==} engines: {node: '>=6'} @@ -5522,13 +5694,6 @@ packages: resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==} dev: false - /phin@3.7.0: - resolution: {integrity: sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg==} - engines: {node: '>= 8'} - dependencies: - centra: 2.6.0 - dev: false - /picocolors@1.0.0: resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==} dev: true @@ -5683,6 +5848,11 @@ packages: resolution: {integrity: sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag==} dev: false + /quick-lru@5.1.1: + resolution: {integrity: sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==} + engines: {node: '>=10'} + dev: false + /react-is@18.2.0: resolution: {integrity: sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==} dev: true @@ -5753,6 +5923,10 @@ packages: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: '>=0.10.0'} + /resolve-alpn@1.2.1: + resolution: {integrity: sha512-0a1F4l73/ZFZOakJnQ3FvkJ2+gSTQWz/r2KE5OdDY0TxPm5h4GkqkWWfM47T7HsbnOtcJVEF4epCVy6u7Q3K+g==} + dev: false + /resolve-cwd@3.0.0: resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==} engines: {node: '>=8'} @@ -5781,6 +5955,13 @@ packages: supports-preserve-symlinks-flag: 1.0.0 dev: true + /responselike@3.0.0: + resolution: {integrity: sha512-40yHxbNcl2+rzXvZuVkrYohathsSJlMTXKryG5y8uciHv1+xDLHQpgjG64JUO9nrEq2jGLH6IZ8BcZyw3wrweg==} + engines: {node: '>=14.16'} + dependencies: + lowercase-keys: 3.0.0 + dev: false + /reusify@1.0.4: resolution: {integrity: sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==} engines: {iojs: '>=1.0.0', node: '>=0.10.0'} @@ -5944,6 +6125,13 @@ packages: /sprintf-js@1.0.3: resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==} + /ssf@0.11.2: + resolution: {integrity: sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==} + engines: {node: '>=0.8'} + dependencies: + frac: 1.1.2 + dev: false + /stack-utils@2.0.6: resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==} engines: {node: '>=10'} @@ -6414,6 +6602,16 @@ packages: dependencies: isexe: 2.0.0 + /wmf@1.0.2: + resolution: {integrity: sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==} + engines: {node: '>=0.8'} + dev: false + + /word@0.3.0: + resolution: {integrity: sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==} + engines: {node: '>=0.8'} + dev: false + /wrap-ansi@7.0.0: resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==} engines: {node: '>=10'} @@ -6455,6 +6653,20 @@ packages: optional: true dev: false + /xlsx@0.18.5: + resolution: {integrity: sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==} + engines: {node: '>=0.8'} + hasBin: true + dependencies: + adler-32: 1.3.1 + cfb: 1.2.2 + codepage: 1.15.0 + crc-32: 1.2.2 + ssf: 0.11.2 + wmf: 1.0.2 + word: 0.3.0 + dev: false + /xml2js@0.6.2: resolution: {integrity: sha512-T4rieHaC1EXcES0Kxxj4JWgaUQHDk+qwHcYOCFHfiwKz7tOVPLq7Hjq9dM1WCMhylqMEfP7hMcOIChvotiZegA==} engines: {node: '>=4.0.0'} @@ -6527,8 +6739,7 @@ packages: engines: {node: '>=10'} dev: true - /youtube-transcript@1.0.6: - resolution: {integrity: sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA==} - dependencies: - phin: 3.7.0 + /youtube-transcript@1.2.1: + resolution: {integrity: sha512-TvEGkBaajKw+B6y91ziLuBLsa5cawgowou+Bk0ciGpjELDfAzSzTGXaZmeSSkUeknCPpEr/WGApOHDwV7V+Y9Q==} + engines: {node: '>=18.0.0'} dev: false diff --git a/src/__tests__/providers/YouTube/index.test.ts b/src/__tests__/providers/YouTube/index.test.ts index 37ed0ae..afadc2c 100644 --- a/src/__tests__/providers/YouTube/index.test.ts +++ b/src/__tests__/providers/YouTube/index.test.ts @@ -32,5 +32,5 @@ describe("YouTubeDataProvider", () => { "and that's pretty much all there is to" ); expect(documents[0].content.toLowerCase()).toContain("say"); - }); + }, 60000); }); diff --git a/src/providers/WebScraper/single_url.ts b/src/providers/WebScraper/single_url.ts index b540ea3..76cfd58 100644 --- a/src/providers/WebScraper/single_url.ts +++ b/src/providers/WebScraper/single_url.ts @@ -75,8 +75,8 @@ export async function scrapSingleUrl(urlToScrap: string, toMarkdown: boolean = t const text = sanitizeText(formattedText.trim()); if (metadata) { - console.log(markdownContent) - console.log("here", toMarkdown) + // console.log(markdownContent) + // console.log("here", toMarkdown) return { content: text, provider: "web-scraper", diff --git a/src/providers/YouTube/index.ts b/src/providers/YouTube/index.ts index f42b0f9..ad1b82e 100644 --- a/src/providers/YouTube/index.ts +++ b/src/providers/YouTube/index.ts @@ -3,6 +3,7 @@ import { Document } from "../../entities/Document"; import { YoutubeTranscript } from "youtube-transcript"; // import puppeteer from "puppeteer"; import { Progress } from "../../entities/Progress"; +import he from 'he'; export type YouTubeInputOptions = { urls: string[]; @@ -42,11 +43,13 @@ export class YouTubeDataProvider implements DataProvider { let content = ""; try { - const data = await YoutubeTranscript.fetchTranscript(this.urls[i]); + const data = await YoutubeTranscript.fetchTranscript(this.urls[i], { lang: "en" }); for (const item of data) { - content += item.text + " \n"; + content += he.decode(item.text) + " \n"; } + content = he.decode(content); + documents.push({ content: content.replace(/ +/g, " ").trim(), metadata: {