Skip to content

Commit

Permalink
Merge branch 'common-voice:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
bact authored Jan 25, 2024
2 parents 9e5ec79 + c559db8 commit b874da6
Show file tree
Hide file tree
Showing 45 changed files with 14,706,592 additions and 664 deletions.
40 changes: 26 additions & 14 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,36 @@ name: CI
on: [push, pull_request]

jobs:
# Lint job: runs clippy once, on Linux only, separately from the test matrix.
lint:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v3
    # The toolchain action is referenced by commit SHA and the nightly is
    # pinned to a fixed date, so the set of clippy lints does not drift.
    - uses: hecrj/setup-rust-action@50a120e4d34903c2c1383dec0e9b1d349a9cc2b1
      with:
        rust-version: nightly-2023-06-28
        components: clippy
    - name: Run clippy
      run: cargo clippy --all-targets --all-features

test:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macOS-latest]
rust: [nightly]
rust: [nightly-2023-06-28]

steps:

# SETUP
- uses: hecrj/[email protected]
with:
rust-version: ${{ matrix.rust }}
components: clippy
- uses: actions/checkout@master

# TESTS
- name: Run clippy
run: cargo clippy --all-targets --all-features
- name: Run tests
run: cargo test --verbose
# Steps of the cross-platform test job: check out the sources, install the
# pinned Rust toolchain from the matrix, set up Python plus the packages in
# requirements.txt, then run the Rust test suite.
- uses: actions/checkout@v3
- uses: hecrj/setup-rust-action@50a120e4d34903c2c1383dec0e9b1d349a9cc2b1
  with:
    rust-version: ${{ matrix.rust }}
- name: Set up Python 3.11
  uses: actions/setup-python@v4
  with:
    # Quoted on purpose: an unquoted version is parsed as a YAML float, which
    # silently turns e.g. 3.10 into 3.1 when the version is bumped.
    python-version: "3.11"
- name: Install Python dependencies
  run: |
    python -m pip install --upgrade pip
    pip install -r requirements.txt
- name: Run tests
  run: cargo test --verbose
76 changes: 76 additions & 0 deletions .github/workflows/manual-dispatch-wikipedia-rerun.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Manually triggered Wikipedia extraction for a single language.
# This is the main way to run an extraction that only covers articles
# newer than a given date.

name: Manual Dispatch - Wikipedia Extraction Rerun

on:
  workflow_dispatch:
    inputs:
      language:
        description: "Language Code"
        required: true
        default: ""
      # NOTE(review): "endDate" is described as the earliest date and
      # "startDate" as the latest one. Presumably this mirrors a
      # newest-to-oldest listing in the query script - confirm against
      # scripts/new-wikipedia-articles.js.
      endDate:
        description: "Earliest date to fetch for"
        required: true
        default: ""
      startDate:
        description: "Latest date to fetch for (defaults to today)"
        required: false
        default: ""

jobs:
  extract:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - ubuntu-latest
        node-version:
          - 18.x
        rust:
          - nightly-2023-06-28

    steps:
      # SETUP
      # Runs before the checkout; the options ask the action to drop the
      # .NET, Android and Haskell toolchains from the runner.
      - name: Maximize build space
        uses: easimon/maximize-build-space@b4d02c14493a9653fe7af06cc89ca5298071c66e
        with:
          root-reserve-mb: 512
          swap-size-mb: 1024
          remove-dotnet: "true"
          remove-android: "true"
          remove-haskell: "true"
      - uses: hecrj/setup-rust-action@50a120e4d34903c2c1383dec0e9b1d349a9cc2b1
        with:
          rust-version: ${{ matrix.rust }}
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v3
        with:
          node-version: ${{ matrix.node-version }}
      - uses: actions/checkout@v3

      # NEW ARTICLES
      # Writes the script's stdout to output/new-article-titles.txt. The
      # dispatch inputs reach the script through WIKI_* environment variables.
      - name: Query new articles
        env:
          CI: true
          WIKI_START_DATE: ${{ github.event.inputs.startDate }}
          WIKI_END_DATE: ${{ github.event.inputs.endDate }}
          WIKI_LOCALE: ${{ github.event.inputs.language }}
        run: |
          mkdir output
          cd scripts
          npm ci
          node new-wikipedia-articles.js > ../output/new-article-titles.txt

      # EXTRACTION ON NEW ARTICLES ONLY
      # The language input is handed over as an environment variable and
      # referenced quoted, instead of being expanded into the command text.
      - name: Full Wikipedia Extraction - ${{ github.event.inputs.language }}
        env:
          LANGUAGE: ${{ github.event.inputs.language }}
        run: ./scripts/extraction.sh extract "$LANGUAGE" ./output/new-article-titles.txt
      - name: Deduplicate Wikipedia Extraction
        run: ./scripts/dedupe.sh extract.txt

      # UPLOAD
      # Publishes everything under output/ as one artifact named "extraction".
      - uses: actions/upload-artifact@v3
        with:
          name: extraction
          path: output/*
59 changes: 28 additions & 31 deletions .github/workflows/manual-dispatch-wikipedia.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,46 +9,43 @@ on:
workflow_dispatch:
inputs:
language:
description: 'Language Code'
description: "Language Code"
required: true
default: ''
default: ""

jobs:
extract:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
rust: [nightly]
rust: [nightly-2023-06-28]

steps:

# SETUP
- name: Maximize build space
uses: easimon/maximize-build-space@master
with:
root-reserve-mb: 512
swap-size-mb: 1024
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
- uses: hecrj/setup-rust-action@e0938bab41405f7485391869b453779c5290099d
with:
rust-version: ${{ matrix.rust }}
- uses: actions/checkout@master
# SETUP
- name: Maximize build space
uses: easimon/maximize-build-space@b4d02c14493a9653fe7af06cc89ca5298071c66e
with:
root-reserve-mb: 512
swap-size-mb: 1024
remove-dotnet: "true"
remove-android: "true"
remove-haskell: "true"
- uses: hecrj/setup-rust-action@50a120e4d34903c2c1383dec0e9b1d349a9cc2b1
with:
rust-version: ${{ matrix.rust }}
- uses: actions/checkout@v3

# EXTRACTION
- name: Full Wikipedia Extraction - ${{ github.event.inputs.language }}
run: ./scripts/extraction.sh extract ${{ github.event.inputs.language }}
- name: Deduplicate Wikipedia Extraction
run: ./scripts/dedupe.sh extract.txt
# EXTRACTION
- name: Full Wikipedia Extraction - ${{ github.event.inputs.language }}
env:
LANGUAGE: ${{ github.event.inputs.language }}
run: ./scripts/extraction.sh extract "$LANGUAGE"
- name: Deduplicate Wikipedia Extraction
run: ./scripts/dedupe.sh extract.txt

# UPLOAD
- uses: actions/upload-artifact@v2
with:
name: extraction
path: output/*
- uses: actions/upload-artifact@v2
with:
name: wikiextractor-output
path: text/*
# UPLOAD
# Publishes everything under output/ as the "extraction" artifact.
# Uses v3 of the action: v2 is deprecated, and v3 is the version the other
# extraction workflows already use.
- uses: actions/upload-artifact@v3
  with:
    name: extraction
    path: output/*
59 changes: 28 additions & 31 deletions .github/workflows/manual-dispatch-wikisource.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,46 +9,43 @@ on:
workflow_dispatch:
inputs:
language:
description: 'Language Code'
description: "Language Code"
required: true
default: ''
default: ""

jobs:
extract:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
rust: [nightly]
rust: [nightly-2023-06-28]

steps:

# SETUP
- name: Maximize build space
uses: easimon/maximize-build-space@master
with:
root-reserve-mb: 512
swap-size-mb: 1024
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
- uses: hecrj/setup-rust-action@e0938bab41405f7485391869b453779c5290099d
with:
rust-version: ${{ matrix.rust }}
- uses: actions/checkout@master
# SETUP
- name: Maximize build space
uses: easimon/maximize-build-space@b4d02c14493a9653fe7af06cc89ca5298071c66e
with:
root-reserve-mb: 512
swap-size-mb: 1024
remove-dotnet: "true"
remove-android: "true"
remove-haskell: "true"
- uses: hecrj/setup-rust-action@50a120e4d34903c2c1383dec0e9b1d349a9cc2b1
with:
rust-version: ${{ matrix.rust }}
- uses: actions/checkout@v3

# EXTRACTION
- name: Full Wikisource Extraction - ${{ github.event.inputs.language }}
run: ./scripts/extraction.sh extract-wikisource ${{ github.event.inputs.language }}
- name: Deduplicate Wikipedia Extraction
run: ./scripts/dedupe.sh extract-wikisource.txt
# EXTRACTION
# The language input is handed over as an environment variable and referenced
# quoted, instead of being expanded directly into the command text.
- name: Full Wikisource Extraction - ${{ github.event.inputs.language }}
  env:
    LANGUAGE: ${{ github.event.inputs.language }}
  run: ./scripts/extraction.sh extract-wikisource "$LANGUAGE"
# Step name corrected: this workflow dedupes the Wikisource output
# (extract-wikisource.txt), not the Wikipedia one.
- name: Deduplicate Wikisource Extraction
  run: ./scripts/dedupe.sh extract-wikisource.txt

# UPLOAD
- uses: actions/upload-artifact@v2
with:
name: extraction
path: output/*
- uses: actions/upload-artifact@v2
with:
name: wikiextractor-output
path: text/*
# UPLOAD
- uses: actions/upload-artifact@v3
with:
name: extraction
path: output/*
Loading

0 comments on commit b874da6

Please sign in to comment.