Skip to content

Commit

Permalink
Merge pull request #8 from likhitha-surapaneni/add_check
Browse files Browse the repository at this point in the history
Add check to compare staging with archive files
  • Loading branch information
olaaustine authored Sep 26, 2023
2 parents 66cc470 + 370228f commit 35fa2c9
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 32 deletions.
86 changes: 57 additions & 29 deletions bin/sanitycheck_on_dbfiles.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env python
import argparse
import os
import sys
import pdb
import logging
import re
Expand All @@ -19,15 +21,18 @@
" from the $DBNAME env variable")
parser.add_argument('--firepwd', help="FIRE api password. If not provided then it will try to guess the FIRE"
" pwd from the $FIRE_PWD env variable")
parser.add_argument('--directory', help="Directory to compare staging and archive" )
parser.add_argument('--log', default='INFO', help="Logging level. i.e. DEBUG, INFO, WARNING, ERROR, CRITICAL")


args = parser.parse_args()

if not os.path.isfile(args.settings):
raise Exception(f"Config file provided using --settings option({args.settings}) not found!")
sys.exit(f"Config file provided using --settings option({args.settings}) not found!")
# set the CONFIG_FILE env variable
os.environ["CONFIG_FILE"] = os.path.abspath(args.settings)

staging_path = '/nfs/1000g-work/G1K/archive_staging/'
archive_path ='/nfs/1000g-archive/vol1/'
# Parse config file
settingsO = ConfigParser()
settingsO.read(args.settings)
Expand Down Expand Up @@ -61,40 +66,63 @@
firepwd = os.getenv('FIRE_PWD')

if dbname is None:
raise Exception("$DBNAME undefined. You need either to pass the name of the "
sys.exit("$DBNAME undefined. You need either to pass the name of the "
"RESEQTRACK database using the --dbname option or set a $DBNAME "
"environment variable before running this script!")
if dbpwd is None:
raise Exception("$DBPWD undefined. You need either to pass the password of the MYSQL "
sys.exit("$DBPWD undefined. You need either to pass the password of the MYSQL "
"server containing the RESEQTRACK database using the --dbpwd option or set a $DBPWD environment "
"variable before running this script!")
if firepwd is None:
raise Exception("$FIRE_PWD undefined. You need either to pass the FIRE API password using the --firepwd option"
" or set a $FIRE_PWD environment variable before running this script!")

# connection to Reseqtrack DB
db = DB(pwd=dbpwd,
dbname=dbname)

# connection to FIRE api
api = API(pwd=firepwd)

flist = db.fetch_files_by_pattern(pattern='/nfs/1000g-archive/vol1/ftp/')

logger.info(f"Number of files returned with this pattern {len(flist)}")

tot_counter = 0
count = 0
for p in flist:
if count == 100:
logger.info(f"{tot_counter} lines processed!")
count = 0
tot_counter += 1
count += 1

if settingsO.get('ftp', 'ftp_mount') in p:
fire_path = re.sub(settingsO.get('ftp', 'ftp_mount') + "/", '', p)
fire_obj = None
fire_obj = api.fetch_object(firePath=fire_path)
if fire_obj is None:
print(f"ERROR: File witH PATH {p} is not archived in FIRE")
# Mode 1 (--directory given): list the DB entries recorded for this directory
# under the staging area and under the archive area, and dump each listing to
# a file in the current working directory.
if args.directory:
# Files recorded under the staging area are fetched first,
# then the files recorded under the archive area.
# Trailing "/" is stripped first so that basename() does not return "".
basename = os.path.basename(args.directory.rstrip("/"))
# The "%" between the root path and the basename is passed through to
# fetch_files_by_pattern as part of the pattern string.
staging_list = db.fetch_files_by_pattern(pattern=f"{staging_path}%{basename}")
len_staging_list = 0
len_archive_list = 0
if staging_list:
# One entry per line, written to "<basename>_staging_files".
with open(f"{basename}_staging_files", 'w') as sf:
for staging_file in staging_list:
sf.write(f"{staging_file}\n")
len_staging_list = len(staging_list)
archive_list = db.fetch_files_by_pattern(pattern=f"{archive_path}%{basename}")
if archive_list:
# One entry per line, written to "<basename>_archive_files".
with open(f"{basename}_archive_files", 'w') as af:
for archive_file in archive_list:
af.write(f"{archive_file}\n")
len_archive_list = len(archive_list)
logger.info(f"Number of files returned in staging: {len_staging_list}")
logger.info(f"Number of files returned in archive: {len_archive_list}")
# NOTE(review): this message is emitted whenever both listings are non-empty;
# the two listings are not compared entry by entry here -- confirm that the
# wording matches the intended check.
if staging_list and archive_list:
logger.info(f"Staging files in {basename}_staging_files are not archived")

# Mode 2 (no --directory): check every DB entry under the FTP archive root
# against FIRE. The FIRE password is only required in this mode.
elif not args.directory:
if firepwd is None:
sys.exit("$FIRE_PWD undefined. You need either to pass the FIRE API password using the --firepwd option"
" or set a $FIRE_PWD environment variable before running this script!")
# connection to FIRE api
api = API(pwd=firepwd)
logger.info("No specific directory specified. Check all the files on ftp")
flist = db.fetch_files_by_pattern(pattern='/nfs/1000g-archive/vol1/ftp/')
logger.info(f"Number of files returned with this pattern /nfs/1000g-archive/vol1/ftp/ {len(flist)}")

# tot_counter counts every entry seen; count is reset each time it reaches
# 100 and only drives the periodic progress message.
tot_counter = 0
count = 0
for p in flist:
if count == 100:
logger.info(f"{tot_counter} lines processed!")
count = 0
tot_counter += 1
count += 1

# Only entries containing the configured ftp_mount are checked; the mount
# prefix is removed to obtain the path that is looked up in FIRE.
# NOTE(review): ftp_mount is used as a regular expression by re.sub, so any
# regex metacharacters in the configured value would be interpreted -- confirm.
if settingsO.get('ftp', 'ftp_mount') in p:
fire_path = re.sub(settingsO.get('ftp', 'ftp_mount') + "/", '', p)
# The None assignment is immediately overwritten by the lookup below.
fire_obj = None
fire_obj = api.fetch_object(firePath=fire_path)
# A None result from the lookup is reported as "not archived".
if fire_obj is None:
print(f"ERROR: File witH PATH {p} is not archived in FIRE")
20 changes: 20 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,26 @@ Note the double asterisk, which indicates any subdirectory included in the paren
By default, the script will perform a dry run and the files will not be moved. You need to run
``move_files.py`` with the option ``--dry False`` to move them.

Sanity check
------------
The ``sanitycheck_on_dbfiles.py`` script performs sanity checks on archived files.

To compare the files between the staging and archive folders,
you need to run the script doing::

sanitycheck_on_dbfiles.py --settings settings.ini --dbpwd $DBPWD --dbname $DBNAME --directory dir_name

- ``--directory`` is the name of the directory in the FTP area whose staging and archive file listings are compared.
- ``--firepwd`` is the password for connecting to the FIRE API (it is not needed when ``--directory`` is used).

To check whether all the files reported as archived in the DB are present on the FIRE FTP,
you need to run the script doing::

sanitycheck_on_dbfiles.py --settings settings.ini --dbpwd $DBPWD --dbname $DBNAME --firepwd fire_pwd

Using ``--firepwd`` without ``--directory`` checks every file recorded in the DB under ``/nfs/1000g-archive/vol1/ftp/`` against FIRE.


Indices and tables
==================

Expand Down
6 changes: 3 additions & 3 deletions igsr_archive/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,11 @@ def fetch_files_by_pattern(self, pattern):
------
pymysql.Error
"""

db_logger.debug(f"Fetching all files for pattern: {pattern}")
cursor = self.conn.cursor(pymysql.cursors.DictCursor)
query = "SELECT * FROM file WHERE name like %s"
cursor.execute(query, [pattern+'%'])
query = f"SELECT * FROM file WHERE name like '{pattern}%'"
cursor.execute(query)
file_list = []
try:
result_set = cursor.fetchall()
Expand Down

0 comments on commit 35fa2c9

Please sign in to comment.