Skip to content

Commit

Permalink
make compatible for publishing
Browse files Browse the repository at this point in the history
  • Loading branch information
themousepotato committed Feb 26, 2021
1 parent 4fdfb9a commit 59e8c42
Show file tree
Hide file tree
Showing 30 changed files with 100 additions and 89 deletions.
70 changes: 38 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,50 +15,56 @@ The database contains the following fields:
5. Meta (a JSON encoded field of whatever fields each source provides)
```

## Installation
```bash
$ pip install unscrapulous
```

## Usage
```bash
$ unscrapulous --config=config.toml --output=output.csv
```

## Development
```bash
$ git clone git@github.com:themousepotato/unscrapulous.git
$ cd unscrapulous
$ curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python -
$ poetry install
$ poetry shell
```

## Usage
```bash
$ ./unscrapulous.py --config=config.toml --output=output.csv
$ poetry build
$ pip install dist/unscrapulous-*.whl
```

## Config
The `config.toml` file has the following format:
```toml
[scrapers]
arbitration-awards-bse = false
arbitration-awards-nse = false
bse-defaulter-and-expelled-members = false
icex-defaulter-members = false
icex-expelled-members = false
income-tax-defaulters = false
irda-blacklisted = false
mca-company-defaulter-list = false
mca-director-defaulter-list = false
mca-director-disqualified-list = false
mca-proclaimed-offenders-ind = false
mcx-action-ap = false
mcx-defaulter-members = false
mcx-secretaries-defaulter-list = false
mse-arbitral-awards = false
ncdex-suspended-defaulted-expelled-debarred-members = false
nse-defaulted-members = false
nse-expelled-members = false
nse-regulatory-defaulting-clients = false
sebi-debarred-bse = false
sebi-debarred-nse = false
sfio-convicted = true
sfio-proclaimed-offenders = false
unsc-1988 = false
unsc-consolidated-list = false
wildlife-crime-convicts = true
arbitration_awards_bse = false
arbitration_awards_nse = false
bse_defaulter_and_expelled_members = false
icex_defaulter_members = false
icex_expelled_members = false
income_tax_defaulters = false
irda_blacklisted = false
mca_company_defaulter_list = false
mca_director_defaulter_list = false
mca_director_disqualified_list = false
mca_proclaimed_offenders_ind = false
mcx_action_ap = false
mcx_defaulter_members = false
mcx_secretaries_defaulter_list = false
mse_arbitral_awards = false
ncdex_suspended_defaulted_expelled_debarred_members = false
nse_defaulted_members = false
nse_expelled_members = false
nse_regulatory_defaulting_clients = false
sebi_debarred_bse = true
sebi_debarred_nse = true
sfio_convicted = false
sfio_proclaimed_offenders = false
unsc_1988 = false
unsc_consolidated_list = false
wildlife_crime_convicts = false
```

## Roadmap
Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "unscrapulous"
version = "0.1.0"
version = "0.1.1"
description = "A utility that scrapes lists of unscrupulous entities (barred from doing financial business) published by various legal institutions"
authors = ["Navaneeth Suresh <[email protected]>"]
license = "MIT License"
Expand All @@ -21,6 +21,9 @@ toml = "^0.10.2"

[tool.poetry.dev-dependencies]

[tool.poetry.scripts]
unscrapulous = 'unscrapulous.unscrapulous:main'

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.bseindia.com/investors/ArbitAwards.aspx'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www1.nseindia.com/invest/dynaContent/arbitration_award.jsp?requestPage=main&qryFlag=yes'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.bseindia.com/static/members/List_defaulters_Expelled_members.aspx'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
52 changes: 26 additions & 26 deletions unscrapulous/config.toml
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
[scrapers]
arbitration-awards-bse = false
arbitration-awards-nse = false
bse-defaulter-and-expelled-members = false
icex-defaulter-members = false
icex-expelled-members = false
income-tax-defaulters = false
irda-blacklisted = false
mca-company-defaulter-list = false
mca-director-defaulter-list = false
mca-director-disqualified-list = false
mca-proclaimed-offenders-ind = false
mcx-action-ap = false
mcx-defaulter-members = false
mcx-secretaries-defaulter-list = false
mse-arbitral-awards = false
ncdex-suspended-defaulted-expelled-debarred-members = false
nse-defaulted-members = false
nse-expelled-members = false
nse-regulatory-defaulting-clients = false
sebi-debarred-bse = false
sebi-debarred-nse = false
sfio-convicted = true
sfio-proclaimed-offenders = false
unsc-1988 = false
unsc-consolidated-list = false
wildlife-crime-convicts = true
arbitration_awards_bse = false
arbitration_awards_nse = false
bse_defaulter_and_expelled_members = false
icex_defaulter_members = false
icex_expelled_members = false
income_tax_defaulters = false
irda_blacklisted = false
mca_company_defaulter_list = false
mca_director_defaulter_list = false
mca_director_disqualified_list = false
mca_proclaimed_offenders_ind = false
mcx_action_ap = false
mcx_defaulter_members = false
mcx_secretaries_defaulter_list = false
mse_arbitral_awards = false
ncdex_suspended_defaulted_expelled_debarred_members = false
nse_defaulted_members = false
nse_expelled_members = false
nse_regulatory_defaulting_clients = false
sebi_debarred_bse = true
sebi_debarred_nse = true
sfio_convicted = false
sfio_proclaimed_offenders = false
unsc_1988 = false
unsc_consolidated_list = false
wildlife_crime_convicts = false
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.icexindia.com/membership/expelled-defaulter-surrendered-members'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.icexindia.com/membership/expelled-defaulter-surrendered-members'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'http://office.incometaxindia.gov.in/administration/_layouts/15/inplview.aspx?List={5A26177B-D7A0-4251-843D-5E6C0B3C3DF2}&View={D8DD9754-8FD1-4D72-9908-727646E99CA0}&ViewCount=450&IsXslView=TRUE&IsCSR=TRUE&Paged=TRUE&p_ID='
FILE_URL = 'http://office.incometaxindia.gov.in/administration/Lists/Tax%20Defaulters/AllItems.aspx'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *
import datetime

SOURCE = 'https://agencyportal.irdai.gov.in/PublicAccess/BlackListedAgent.aspx'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

PARENT_SOURCE = 'http://www.mca.gov.in'
SOURCE = 'http://www.mca.gov.in/MinistryV2/defaultercompanieslist.html'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

PARENT_SOURCE = 'http://www.mca.gov.in'
SOURCE = 'http://www.mca.gov.in/MinistryV2/defaulterdirectorslist.html'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

PARENT_SOURCE = 'http://www.mca.gov.in'
SOURCE = 'http://www.mca.gov.in/MinistryV2/disqualifieddirectorslist.html'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

PARENT_SOURCE = 'http://www.mca.gov.in'
SOURCE = 'http://www.mca.gov.in/MinistryV2/proclaimedoffenders.html'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

PARENT_SOURCE = 'https://www.mcxindia.com'
SOURCE = 'https://www.mcxindia.com/membership/notice-board/notice-board-disciplinary-action'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

PARENT_SOURCE = 'https://www.mcxindia.com'
SOURCE = 'https://www.mcxindia.com/membership/notice-board/notice-board-disciplinary-action'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.mcxindia.com/Investor-Services/defaulters/defaulters-list'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.msei.in/investors/list-of-arbitrators'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://ncdex.com/suspended_member/latest_info'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www1.nseindia.com/invest/json/def_members.json'
FILE_PARENT_URL = 'https://www1.nseindia.com/invest/resources/download/'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www1.nseindia.com/invest/json/exp_members.json'
FILE_PARENT_URL = 'https://www1.nseindia.com/invest/resources/download/'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.nseindia.com/regulations/exchange-defaulting-clients'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

PARENT_SOURCES = ['https://www.bseindia.com', 'https://www.bseindia.com/investors/']
SOURCE = 'https://www.bseindia.com/investors/debent.aspx'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.nseindia.com/regulations/member-sebi-debarred-entities'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://sfio.nic.in/'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://sfio.nic.in/'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
2 changes: 1 addition & 1 deletion unscrapulous/unsc-1988.py → unscrapulous/unsc_1988.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://scsanctions.un.org/taliban/'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from utils import *
from unscrapulous.utils import *

SOURCE = 'https://www.un.org/securitycouncil/content/un-sc-consolidated-list'
OUTPUT_DIR = '/tmp/unscrapulous/files'
Expand Down
Loading

0 comments on commit 59e8c42

Please sign in to comment.