macs3-project · taoliu · May 7, 2024 · Mar 28, 2024 · Mar 28, 2024 · Apr 9, 2024
diff --git a/.github/workflows/build-and-test-MACS3-macos.yml b/.github/workflows/build-and-test-MACS3-macos.yml
@@ -8,12 +8,14 @@ on:
     paths-ignore:
       - 'docs/**'
       - '**.md'
+      - 'ChangeLog'
       - '.github/workflows/**'
       - '!.github/workflows/build-and-test-MACS3-macos.yml'
   pull_request:
     paths-ignore:
       - 'docs/**'
       - '**.md'
+      - 'ChangeLog'
       - '.github/workflows/**'
       - '!.github/workflows/build-and-test-MACS3-macos.yml'
 

diff --git a/.github/workflows/build-and-test-MACS3-non-x64.yml b/.github/workflows/build-and-test-MACS3-non-x64.yml
@@ -8,12 +8,14 @@ on:
     paths-ignore:
       - 'docs/**'
       - '**.md'
+      - 'ChangeLog'
       - '.github/workflows/**'
       - '!.github/workflows/build-and-test-MACS3-non-x64.yml'
   pull_request:
     paths-ignore:
       - 'docs/**'
       - '**.md'
+      - 'ChangeLog'
       - '.github/workflows/**'
       - '!.github/workflows/build-and-test-MACS3-non-x64.yml'
 

diff --git a/.github/workflows/build-and-test-MACS3-x64.yml b/.github/workflows/build-and-test-MACS3-x64.yml
@@ -8,12 +8,14 @@ on:
     paths-ignore:
       - 'docs/**'
       - '**.md'
+      - 'ChangeLog'
       - '.github/workflows/**'
       - '!.github/workflows/build-and-test-MACS3-x64.yml'
   pull_request:
     paths-ignore:
       - 'docs/**'
       - '**.md'
+      - 'ChangeLog'      
       - '.github/workflows/**'
       - '!.github/workflows/build-and-test-MACS3-x64.yml'
 

diff --git a/ChangeLog b/ChangeLog
@@ -1,5 +1,41 @@
+2024-04-27  Tao Liu  <[email protected]>
+	MACS 3.0.2 (pending)
+
+	* Features added
+
+	1) Introduce a new emission model for the `hmmratac` function. Now
+	users can choose the simpler Poisson emission `--hmm-type poisson`
+	instead of the default Gaussian emission. As a consequence, the
+	saved HMM model file in json will include the hmm-type information
+	as well. Note that in order to be compatible with the HMM model
+	file from previous version, if there is no hmm-type information in
+	the model file, the hmm-type will be assigned as gaussian. #635
+
+	2) Add `--cutoff-analysis-steps` and `--cutoff-analysis-max` for
+	`callpeak`, `bdgpeakcall`, and `hmmratac` so that we can
+	have finer resolution of the cutoff analysis report. #636  #642
+
+	3) Reduce memory usage of `hmmratac` during decoding step, by
+	writing decoding results to a temporary file on disk (file
+	location depends on the environmental TEMP setting), then loading
+	it back while identifying state pathes. This change will decrease
+	the memory usage dramatically. #628 #640
+
+	* Bugs fixed
+
+	1) Use `-O3` instead of `-Ofast` for compatibility. #637
+
+	* Documentation
+
+	1) Update instruction to install macs3 through conda/bioconda
+
+	2) Reorganize MACS3 docs and publish through
+	https://macs3-project.github.io/MACS
+
+	3) Description on various file formats used in MACS3.
+
 2024-02-19  Tao Liu  <[email protected]>
-	MACS 3.0.1
+	MACS 3.0.1 
 
 	* Bugs fixed
 

diff --git a/MACS3/Commands/callvar_cmd.py b/MACS3/Commands/callvar_cmd.py
@@ -1,4 +1,4 @@
-# Time-stamp: <2022-09-15 17:25:49 Tao Liu>
+# Time-stamp: <2024-05-07 11:47:04 Tao Liu>
 
 """Description: Call variants directly
 
@@ -194,7 +194,7 @@ def run( args ):
                 else:
                     ra_collection = RACollection( chrom, peak, tbam.get_reads_in_region( chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate ) )
             except:
-                info ("No reads found in peak: ", chrom.decode(), peak["start"], peak["end"], ". Skipped!")
+                info ("No reads found in this peak. Skipped" )
                 # while there is no reads in peak region, simply skip it.
                 continue
 

diff --git a/MACS3/Commands/hmmratac_cmd.py b/MACS3/Commands/hmmratac_cmd.py
@@ -1,4 +1,4 @@
-# Time-stamp: <2024-04-26 11:47:02 Tao Liu>
+# Time-stamp: <2024-04-26 15:46:03 Tao Liu>
 
 """Description: Main HMMR command
 
@@ -177,18 +177,13 @@ def run( args ):
     if options.pileup_short:
         options.info( f"#  Pile up ONLY short fragments" )
         fc_bdg = digested_atac_signals[ 0 ]
-        (sum_v, n_v, max_v, min_v, mean_v, std_v) = fc_bdg.summary()
-        #print(sum_v, n_v, max_v, min_v, mean_v, std_v)
-        options.info( f"#  Convert pileup to fold-change over average signal" )
-        fc_bdg.apply_func(lambda x: x/mean_v)
     else:
         options.info( f"#  Pile up all fragments" )
         fc_bdg = petrack.pileup_bdg( [1.0,], baseline_value = 0 )
-        (sum_v, n_v, max_v, min_v, mean_v, std_v) = fc_bdg.summary()
-        options.info( f"#  Convert pileup to fold-change over average signal" )
-        fc_bdg.apply_func(lambda x: x/mean_v)
-
-
+    (sum_v, n_v, max_v, min_v, mean_v, std_v) = fc_bdg.summary()
+    options.info( f"#  Convert pileup to fold-change over average signal" )
+    fc_bdg.apply_func(lambda x: x/mean_v)
+
     # if cutoff_analysis only, generate and save the report and quit
     if options.cutoff_analysis_only:
         # we will run cutoff analysis only and quit
@@ -197,11 +192,10 @@ def run( args ):
 
         # Let MACS3 do the cutoff analysis to help decide the lower and upper cutoffs
         with open(cutoffanalysis_file, "w") as ofhd_cutoff:
-            ofhd_cutoff.write( fc_bdg.cutoff_analysis( min_length=minlen, max_gap=options.hmm_training_flanking, max_score = 100 ) )
+            ofhd_cutoff.write( fc_bdg.cutoff_analysis( min_length=minlen, max_gap=options.hmm_training_flanking, max_score = options.cutoff_analysis_max, steps=options.cutoff_analysis_steps ) )
         #raise Exception("Cutoff analysis only.")
         sys.exit(1)
-
-
+
     #############################################
     # 3. Define training set by peak calling
     #############################################
@@ -244,7 +238,7 @@ def run( args ):
 
         # Let MACS3 do the cutoff analysis to help decide the lower and upper cutoffs
         with open(cutoffanalysis_file, "w") as ofhd_cutoff:
-            ofhd_cutoff.write( fc_bdg.cutoff_analysis( min_length=minlen, max_gap=options.hmm_training_flanking, max_score = 100 ) )
+            ofhd_cutoff.write( fc_bdg.cutoff_analysis( min_length=minlen, max_gap=options.hmm_training_flanking, max_score = options.cutoff_analysis_max ) )
 
         # we will check if anything left after filtering
         if peaks.total > options.hmm_maxTrain:

diff --git a/MACS3/Utilities/OptValidator.py b/MACS3/Utilities/OptValidator.py
@@ -1,4 +1,4 @@
-# Time-stamp: <2023-07-28 12:17:28 Tao Liu>
+# Time-stamp: <2024-04-19 15:11:59 Tao Liu>
 
 """Module Description
 
@@ -32,8 +32,6 @@
 
 import logging
 import MACS3.Utilities.Logger
-logger = logging.getLogger(__name__)
-
 # ------------------------------------
 # Misc functions
 # ------------------------------------

diff --git a/README.md b/README.md
@@ -6,12 +6,12 @@
 download](https://img.shields.io/pypi/dm/macs3?label=pypi%20downloads)](https://pypistats.org/packages/macs3)
 
 Latest Release:
-
-* Github: [![Github Release](https://img.shields.io/github/v/release/macs3-project/MACS)](https://github.com/macs3-project/MACS/releases) 
-* PyPI: [![PyPI Release](https://img.shields.io/pypi/v/macs3.svg)![PyPI Python Version](https://img.shields.io/pypi/pyversions/MACS3)![PyPI Format](https://img.shields.io/pypi/format/macs3)](https://pypi.org/project/macs3/) 
-* Anaconda: [![Anaconda-Server Badge](https://anaconda.org/macs3/macs3/badges/version.svg)](https://anaconda.org/macs3/macs3) 
+* Github: [![Github Release](https://img.shields.io/github/v/release/macs3-project/MACS)](https://github.com/macs3-project/MACS/releases)
+* PyPI: [![PyPI Release](https://img.shields.io/pypi/v/macs3.svg)](https://pypi.org/project/MACS3/)
+* Bioconda:[![Bioconda Badge](https://anaconda.org/bioconda/macs3/badges/version.svg)](https://anaconda.org/bioconda/macs3)
 * Debian Med: [![Debian Stable](https://img.shields.io/debian/v/macs/stable?label=debian%20stable)](https://packages.debian.org/stable/macs)[![Debian Unstable](https://img.shields.io/debian/v/macs/sid?label=debian%20sid)](https://packages.debian.org/sid/macs)
 
+
 ## Introduction
 
 With the improvement of sequencing techniques, chromatin
@@ -30,198 +30,8 @@ applied to any "DNA enrichment assays" if the question to be asked is
 simply: *where we can find significant reads coverage than the random
 background*.
 
-## Changes for MACS (3.0.1) 
-
-*Bugs fixed*
-
-1) Fixed a bug that the `hmmatac` can't correctly save the digested
-	signal
-	files. [#605](https://github.com/macs3-project/MACS/issues/605)
-	[#611](https://github.com/macs3-project/MACS/pull/611)
-
-2) Applied a patch to remove cython requirement from the installed
-	system. (it's needed for building the
-	package). [#606](https://github.com/macs3-project/MACS/issues/606)
-	[#612](https://github.com/macs3-project/MACS/pull/612)
-
-3) Relax the testing script while comparing the peaks called from
-	current codes and the standard peaks. To implement this, we added
-	'intersection' function to 'Regions' class to find the
-	intersecting regions of two Regions object (similar to PeakIO but
-	only recording chromosome, start and end positions). And we
-	updated the unit test 'test_Region.py' then implemented a script
-	'jaccard.py' to compute the Jaccard Index of two peak files. If
-	the JI > 0.99 we would think the peaks called and the standard
-	peaks are similar. This is to avoid the problem caused by
-	different Numpy/SciPy/sci-kit learn libraries, when certain peak
-	coordinates may have 10bps
-	difference. [#615](https://github.com/macs3-project/MACS/issues/615)
-	[#619](https://github.com/macs3-project/MACS/pull/619)
-
-4) Due to [the changes in scikit-learn
-	1.3.0](https://scikit-learn.org/1.3/whats_new/v1.3.html), the way
-	hmmlearn 0.3 uses Kmeans will end up with inconsistent results
-	between sklearn <1.3 and sklearn >=1.3. Therefore, we patched the
-	class hmm.GaussianHMM and adjusted the standard output from
-	`hmmratac` subcommand. The change is based on [hmmlearn
-	PR#545](https://github.com/hmmlearn/hmmlearn/pull/545). The idea
-	is to do the random seeding of KMeans 10 times. Now the `hmmratac`
-	results should be more consistent (at least
-	JI>0.99). [#615](https://github.com/macs3-project/MACS/issues/615)
-	[#620](https://github.com/macs3-project/MACS/pull/620)
-
-*Other*
-
-1) We added some dependencies to MACS3. `hmmratc` subcommand needs 
-	`hmmlearn` library, `hmmlearn` needs `scikit-learn` and 
-	`scikit-learn` needs `scipy`. Since major releases have happened 
-	for both`scipy` and `scikit-learn`, we have to set specific 
-	version requirements for them in order to make sure the output 
-	results from `hmmratac` are consistent. 
-
-2) We updated our documentation website using 
-    Sphinx. https://macs3-project.github.io/MACS/
-
-## Changes for MACS (3.0.0)
-
-1) Call variants in peak regions directly from BAM files. The
-	function was originally developed under code name SAPPER. Now
-	SAPPER has been merged into MACS as the `callvar` command. It can
-	be used to call SNVs and small INDELs directly from alignment
-	files for ChIP-seq or ATAC-seq. We call `fermi-lite` to assemble
-	the DNA sequence at the enriched genomic regions (binding sites or
-	accessible DNA) and to refine the alignment when necessary. We
-	added `simde` as	a submodule in order to support fermi-lite
-	library under non-x64 architectures.
-
-2) HMMRATAC module is added as subcommand `hmmratac`. HMMRATAC is a
-	dedicated software to analyze ATAC-seq data. The basic idea behind
-	HMMRATAC is to digest ATAC-seq data according to the fragment
-	length of read pairs into four signal tracks: short fragments,
-	mono-nucleosomal fragments, di-nucleosomal fragments and
-	tri-nucleosomal fragments. Then integrate the four tracks again
-	using Hidden Markov Model to consider three hidden states: open
-	region, nucleosomal region, and background region. The orginal
-	paper was published in 2019 written in JAVA, by Evan Tarbell. We
-	implemented it in Python/Cython and optimize the whole process
-	using existing MACS functions and hmmlearn. Now it can run much
-	faster than the original JAVA version. Note: evaluation of the
-	peak calling results is still underway.
-
-3) Speed/memory optimization.  Use the cykhash to replace python
-	dictionary. Use buffer (10MB) to read and parse input file (not
-	available for BAM file parser). And many optimization tweaks. We
-	added memory monitoring to the runtime messages.
-
-4) R wrappers for MACS -- MACSr for bioconductor.
-
-5) Code cleanup. Reorganize source codes. 
-
-6) Unit testing. 
-
-7) Switch to Github Action for CI, support multi-arch testing
-	including x64, armv7, aarch64, s390x and ppc64le. We also test on
-	Mac OS 12.
-
-8) MACS tag-shifting model has been refined. Now it will use a naive
-	peak calling approach to find ALL possible paired peaks at + and -
-	strand, then use all of them to calculate the
-	cross-correlation. (a related bug has been fix
-	[#442](https://github.com/macs3-project/MACS/issues/442))
-
-9) BAI index and random access to BAM file now is
-	supported. [#449](https://github.com/macs3-project/MACS/issues/449).
-
-10) Support of Python > 3.10 [#498](https://github.com/macs3-project/MACS/issues/498)
-
-11) The effective genome size parameters have been updated
-	according to deeptools. [#508](https://github.com/macs3-project/MACS/issues/508)
-
-12) Multiple updates regarding dependencies, anaconda built, CI/CD
-	process.
-
-13) Cython 3 is supported.
-
-14) Documentations for each subcommand can be found under /docs
-
-*Other*
-
-1) Missing header line while no peaks can be called
-[#501](https://github.com/macs3-project/MACS/issues/501)
-[#502](https://github.com/macs3-project/MACS/issues/502)
-
-2) Note: different numpy, scipy, sklearn may give slightly
-	different results for hmmratac results. The current standard
-	results for automated testing in `/test` directory are from Numpy
-	1.25.1, Scipy 1.11.1, and sklearn 1.3.0.
-
-## Install
-
-The common way to install MACS is through
-[PYPI](https://pypi.org/project/macs3/)). Please check the
-[INSTALL](docs/INSTALL.md) document for detail.
-
-MACS3 has been tested using GitHub Actions for every push and PR in
-the following architectures:
-
- * x86_64 (Ubuntu 22, Python 3.9, 3.10, 3.11, 3.12)
- * aarch64 (Ubuntu 22, Python 3.10)
- * armv7 (Ubuntu 22, Python 3.10)
- * ppc64le (Ubuntu 22, Python 3.10)
- * s390x (Ubuntu 22, Python 3.10)
- * Apple chips (Mac OS 13, Python 3.9, 3.10, 3.11, 3.12)
-
-In general, you can install through PyPI as `pip install macs3`.  To
-use virtual environment is highly recommended. Or you can install
-after unzipping the released package downloaded from Github, then use
-`pip install .` command. Please note that, we haven't tested
-installation on any Windows OS, so currently only Linux and Mac OS
-systems are supported. Also, for aarch64, armv7, ppc64le and s390x,
-due to some unknown reason potentially related to the scientific
-calculation libraries MACS3 depends on, such as Numpy, Scipy,
-hmm-learn, scikit-learn, the results from `hmmratac` subcommand may
-not be consistent with the results from x86 or Apple chips. Please be
-aware.
-
-## Usage
-
-Example for regular peak calling on TF ChIP-seq:
-
-`macs3 callpeak -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01`
-
-Example for broad peak calling on Histone Mark ChIP-seq:
-
-`macs3 callpeak -t ChIP.bam -c Control.bam --broad -g hs --broad-cutoff 0.1`
-
-Example for peak calling on ATAC-seq (paired-end mode):
-
-`macs3 callpeak -f BAMPE -t ATAC.bam -g hs -n test -B -q 0.01`
-
-There are currently 14 functions available in MACS3 serving as
-sub-commands. Please click on the link to see the detail description
-of the subcommands.
-
-Subcommand | Description
------------|----------
-[`callpeak`](docs/callpeak.md) | Main MACS3 Function to call peaks from alignment results.
-[`bdgpeakcall`](docs/bdgpeakcall.md) | Call peaks from bedGraph file.
-[`bdgbroadcall`](docs/bdgbroadcall.md) | Call nested broad peaks from bedGraph file.
-[`bdgcmp`](docs/bdgcmp.md) | Comparing two signal tracks in bedGraph format.
-[`bdgopt`](docs/bdgopt.md) | Operate the score column of bedGraph file.
-[`cmbreps`](docs/cmbreps.md) | Combine bedGraph files of scores from replicates.
-[`bdgdiff`](docs/bdgdiff.md) | Differential peak detection based on paired four bedGraph files.
-[`filterdup`](docs/filterdup.md) | Remove duplicate reads, then save in BED/BEDPE format file.
-[`predictd`](docs/predictd.md) | Predict d or fragment size from alignment results. In case of PE data, report the average insertion/fragment size from all pairs.
-[`pileup`](docs/pileup.md) | Pileup aligned reads (single-end) or fragments (paired-end)
-[`randsample`](docs/randsample.md) | Randomly choose a number/percentage of total reads, then save in BED/BEDPE format file.
-[`refinepeak`](docs/refinepeak.md) | Take raw reads alignment, refine peak summits.
-[`callvar`](docs/callvar.md) | Call variants in given peak regions from the alignment BAM files.
-[`hmmratac`](docs/hmmratac.md) | Dedicated peak calling based on Hidden Markov Model for ATAC-seq data.
-
-For advanced usage, for example, to run `macs3` in a modular way,
-please read the [advanced usage](docs/Advanced_Step-by-step_Peak_Calling.md). There is a
-[Q&A](docs/qa.md) document where we collected some common questions
-from users.
+Please find MACS3 documentations through [MACS3
+website](https://macs3-project.github.io/MACS/).
 
 ## Contribute
 
@@ -245,9 +55,3 @@ contributions over the years.
 2008: [Model-based Analysis of ChIP-Seq
 (MACS)](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-9-r137)
 
-## Other useful links
-
- * [Cistrome](http://cistrome.org/)
- * [bedTools](http://code.google.com/p/bedtools/)
- * [UCSC toolkits](http://hgdownload.cse.ucsc.edu/admin/exe/)
- * [deepTools](https://github.com/deeptools/deepTools/)