Skip to content

Commit

Permalink
paper edits
Browse files Browse the repository at this point in the history
  • Loading branch information
markur4 committed Mar 8, 2024
1 parent bcc4dcf commit 386b1d6
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 33 deletions.
23 changes: 18 additions & 5 deletions paper.bib
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ @misc{charlierTrevismdStatannotationsV02022
year = {2022},
month = oct,
doi = {10.5281/ZENODO.7213391},
url = {https://zenodo.org/record/7213391},
urldate = {2023-11-16},
abstract = {Add scipy's Brunner-Munzel test Fix applying statannotations for non-string group labels (Issue \#65) Get Zenodo DOI},
copyright = {Open Access},
Expand All @@ -24,12 +23,20 @@ @article{hunterMatplotlib2DGraphics2007
pages = {90--95},
issn = {1558-366X},
doi = {10.1109/MCSE.2007.55},
url = {https://ieeexplore.ieee.org/document/4160265},
urldate = {2023-11-15},
abstract = {Matplotlib is a 2D graphics package used for Python for application development, interactive scripting,and publication-quality image generation across user interfaces and operating systems},
file = {/Users/martinkuric/Zotero/storage/W4FJZDNY/§-hunterMatplotlib2DGraphics2007.pdf;/Users/martinkuric/Zotero/storage/GW3HZZHR/4160265.html}
}

% McKinney (2010), SciPy proceedings paper introducing pandas; booktitle is required for inproceedings entries.
% NOTE(review): month = jan looks like a date-export default rather than the real conference month -- confirm or drop.
@inproceedings{mckinneyDataStructuresStatistical2010,
title = {Data {{Structures}} for {{Statistical Computing}} in {{Python}}},
author = {McKinney, Wes},
editor = {van der Walt, St{\'e}fan and Millman, Jarrod},
booktitle = {Proceedings of the 9th {{Python}} in {{Science Conference}}},
year = {2010},
month = jan,
pages = {56--61},
doi = {10.25080/Majora-92bf1922-00a}
}

@article{mckinneyPandasFoundationalPython2011,
title = {Pandas: A {{Foundational Python Library}} for {{Data Analysis}} and {{Statistics}}},
shorttitle = {Pandas},
Expand All @@ -41,6 +48,15 @@ @article{mckinneyPandasFoundationalPython2011
file = {/Users/martinkuric/Zotero/storage/IH5C5UZ3/§-mckinneyPandasFoundationalPython2011.pdf}
}

% Zenodo software record for pandas; the corporate author is double-braced so it is not split into given name and surname.
@misc{reback2020pandas,
title = {{{pandas-dev/pandas}}: {{Pandas}}},
author = {{The pandas development team}},
year = {2020},
month = feb,
doi = {10.5281/zenodo.3509134},
howpublished = {Zenodo}
}

@article{vallatPingouinStatisticsPython2018,
title = {Pingouin: Statistics in {{Python}}},
shorttitle = {Pingouin},
Expand All @@ -53,7 +69,6 @@ @article{vallatPingouinStatisticsPython2018
pages = {1026},
issn = {2475-9066},
doi = {10.21105/joss.01026},
url = {https://joss.theoj.org/papers/10.21105/joss.01026},
urldate = {2023-05-29},
abstract = {Vallat, (2018). Pingouin: statistics in Python. Journal of Open Source Software, 3(31), 1026, https://doi.org/10.21105/joss.01026},
langid = {english},
Expand All @@ -72,7 +87,6 @@ @article{waskomSeabornStatisticalData2021
pages = {3021},
issn = {2475-9066},
doi = {10.21105/joss.03021},
url = {https://joss.theoj.org/papers/10.21105/joss.03021},
urldate = {2023-03-26},
abstract = {Waskom, M. L., (2021). seaborn: statistical data visualization. Journal of Open Source Software, 6(60), 3021, https://doi.org/10.21105/joss.03021},
langid = {english},
Expand All @@ -89,7 +103,6 @@ @article{wickhamTidyData2014a
pages = {1--23},
issn = {1548-7660},
doi = {10.18637/jss.v059.i10},
url = {https://doi.org/10.18637/jss.v059.i10},
urldate = {2023-11-15},
abstract = {A huge amount of effort is spent cleaning data to get it ready for analysis, but there has been little research on how to make data cleaning as easy and effective as possible. This paper tackles a small, but important, component of data cleaning: data tidying. Tidy datasets are easy to manipulate, model and visualize, and have a specific structure: each variable is a column, each observation is a row, and each type of observational unit is a table. This framework makes it easy to tidy messy datasets because only a small set of tools are needed to deal with a wide range of un-tidy datasets. This structure also makes it easier to develop tidy tools for data analysis, tools that both input and output tidy datasets. The advantages of a consistent data structure and matching tools are demonstrated with a case study free from mundane data manipulation chores.},
copyright = {Copyright (c) 2013 Hadley Wickham},
Expand Down
55 changes: 27 additions & 28 deletions paper.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,50 +29,50 @@ aas-doi: 10.3847/xxxxx <- update this with the DOI from AAS once you know it.
# aas-journal: Astrophysical Journal <- The name of the AAS journal.
---


# Summary

`plotastic` addresses the challenges of transitioning from exploratory
data analysis to hypothesis testing in Python's data science ecosystem.
Bridging the gap between `seaborn` and `pingouin`, this library offers a
unified environment for plotting and statistical analysis. It simplifies
the workflow with a user-friendly syntax and seamless integration with
the workflow with user-friendly syntax and seamless integration with
familiar `seaborn` parameters (y, x, hue, row, col). Inspired by
`seaborn`'s consistency, `plotastic` utilizes a `DataAnalysis` object to
intelligently pass parameters to `pingouin` statistical functions. The
library systematically groups the data according to the needs of
statistical tests and plots, conducts visualisation, analyses and
supports extensive customization options. In essence, `plotastic`
establishes a protocol for configuring statical analyses through
plotting parameters. This approach streamlines the process, translating
`seaborn` parameters into statistical terms, allowing researchers to
focus on correct statistical testing and less about specific syntax and
intelligently pass parameters to `pingouin` statistical functions.
Hence, statistics and plotting are performed on the same set of
parameters, so that the strength of `seaborn` in visualizing
multidimensional data is extended to statistical analysis. In essence,
`plotastic` translates `seaborn` parameters into statistical terms,
configures statistical protocols based on intuitive plotting syntax and
returns a `matplotlib` figure with known customization options and more.
This approach streamlines data analysis, allowing researchers to focus
more on correct statistical testing and less on specific syntax and
implementations.



# Statement of need

Python's data science ecosystem provides powerful tools for both
visualization and statistical testing. However, the transition from
exploratory data analysis to hypothesis testing can be cumbersome,
requiring users to switch between libraries and adapt to different
syntaxes.

`seaborn` has become a popular choice for plotting in Python, offering
an intuitive interface. Its statistical functionality focuses on
descriptive plots and bootstrapped confidence intervals

syntaxes. `seaborn` has become a popular choice for plotting in Python,
offering an intuitive interface. Its statistical functionality focuses
on descriptive plots and bootstrapped confidence intervals
[@waskomSeabornStatisticalData2021]. The library `pingouin` offers an
extensive set of statistical tests, but it lacks integration with common
plotting capabilities [@vallatPingouinStatisticsPython2018].
`statannotations` integrates statistical testing with plot annotations, but
uses a complex interface and is limited to pairwise comparisons
`statannotations` integrates statistical testing with plot annotations,
but uses a complex interface and is limited to pairwise comparisons
[@charlierTrevismdStatannotationsV02022].

`plotastic` addresses this gap by offering a unified environment for
plotting and statistical analysis. With an emphasis on user-friendly
syntax and integration with familiar `seaborn` parameters, it simplifies
the process for users already comfortable `seaborn`. The library ensures
a smooth workflow, from data import to hypothesis testing and
syntax and integration of familiar `seaborn` parameters, it simplifies
the process for users already comfortable with `seaborn`. The library
ensures a smooth workflow, from data import to hypothesis testing and
visualization.

# Example
Expand Down Expand Up @@ -169,11 +169,10 @@ of statistical analysis and plotting, leveraging the capabilities of
@charlierTrevismdStatannotationsV02022]. It utilizes long-format
`pandas` `DataFrames` as its primary input, aligning with the
conventions of `seaborn` and ensuring compatibility with existing data
structures [@wickhamTidyData2014a;
@mckinneyPandasFoundationalPython2011].
structures [@wickhamTidyData2014a; @reback2020pandas; @mckinneyDataStructuresStatistical2010].

`plotastic` was inspired by `seaborn`'s intuitive and consistent usage
of the same set of parameters (y, x, hue, row, col) found in each of its
`plotastic` was inspired by `seaborn` using the same set of intuitive
and consistent parameters (y, x, hue, row, col) found in each of its
plotting functions [@waskomSeabornStatisticalData2021]. These parameters
intuitively delineate the data dimensions plotted, yielding 'facetted'
subplots, each presenting y against x. This allows for rapid and
Expand Down Expand Up @@ -208,24 +207,24 @@ on the x-axis, which is achieved by splitting the data by all grouping
dimensions (hue, row, col) (\autoref{tab:sphericity}). For omnibus and
posthoc analyses, data is grouped by the row and col dimensions in
parallel to the `matplotlib` axes, before performing one two-factor
analysis per axes using x and hue as the within/between-factors.
analysis per axis using x and hue as the within/between-factors
(\autoref{tab:RMANOVA}).

`DataAnalysis` visualizes data through predefined plotting functions
designed for drawing multi-layered plots. A notable emphasis within
`plotastic` is placed on showcasing individual datapoints alongside
aggregated means or medians. In detail, each plotting function
initializes `matplotlib` figure and axes using `plt.subplots()` while
initializes a `matplotlib` figure and axes using `plt.subplots()` while
returning a `DataAnalysis` object for method chaining. Axes are
populated by `seaborn` plotting functions (e.g., `sns.boxplot()`),
leveraging automated aggregation and error bar displays. Keyword
arguments are passed to these `seaborn` functions, ensuring the same
degree of customization as in `seaborn`. Users can further customize plots
by chaining `DataAnalysis` methods or by applying common `matplotlib` code
to override plotastic settings. Figures are exported using
to override `plotastic` settings. Figures are exported using
`plt.savefig()`.

`plotastic` also focuses on in annotating statistical information within
`plotastic` also focuses on annotating statistical information within
plots, seamlessly incorporating p-values from pairwise comparisons using
`statannotations` [@charlierTrevismdStatannotationsV02022]. This
integration simplifies the interface and expands options for pair
Expand Down
Binary file modified paper.pdf
Binary file not shown.

0 comments on commit 386b1d6

Please sign in to comment.