# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.1
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# # Index of ML Operations<a id='top_phases'></a>
# <ul>
# <ul><li><details><summary><h2>Imported Libraries</h2></summary>
# <ul>
#
# <li><b>keras</b></li>
# <li><b>matplotlib</b></li>
# <li><b>numpy</b></li>
# <li><b>os</b></li>
# <li><b>pandas</b></li>
# <li><b>seaborn</b></li>
# <li><b>sklearn</b></li>
#
# </ul>
# </details></li></ul>
# <ul><li><details><summary><h2>Visualization</h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Visualization" Calls</u></b></summary>
# <ul>
#
# <li> <b>seaborn</b>
# <ul>
# <li>
# <details><summary><u>seaborn.distributions.histplot</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Plot univariate or bivariate histograms to show distributions of datasets.
#
# A histogram is a classic visualization tool that represents the distribution
# of one or more variables by counting the number of observations that fall within
# discrete bins.
#
# This function can normalize the statistic computed within each bin to estimate
# frequency, density or probability mass, and it can add a smooth curve obtained
# using a kernel density estimate, similar to :func:`kdeplot`.
#
# More information is provided in the :ref:`user guide <tutorial_hist>`.
#
# Parameters
# ----------
# data : :class:`pandas.DataFrame`, :class:`numpy.ndarray`, mapping, or sequence
#     Input data structure. Either a long-form collection of vectors that can be
#     assigned to named variables or a wide-form dataset that will be internally
#     reshaped.
# x, y : vectors or keys in ``data``
#     Variables that specify positions on the x and y axes.
# hue : vector or key in ``data``
#     Semantic variable that is mapped to determine the color of plot elements.
# weights : vector or key in ``data``
#     If provided, weight the contribution of the corresponding data points
#     towards the count in each bin by these factors.
# stat : str
#     Aggregate statistic to compute in each bin.
#     
#     - `count`: show the number of observations in each bin
#     - `frequency`: show the number of observations divided by the bin width
#     - `probability` or `proportion`: normalize such that bar heights sum to 1
#     - `percent`: normalize such that bar heights sum to 100
#     - `density`: normalize such that the total area of the histogram equals 1
# bins : str, number, vector, or a pair of such values
#     Generic bin parameter that can be the name of a reference rule,
#     the number of bins, or the breaks of the bins.
#     Passed to :func:`numpy.histogram_bin_edges`.
# binwidth : number or pair of numbers
#     Width of each bin, overrides ``bins`` but can be used with
#     ``binrange``.
# binrange : pair of numbers or a pair of pairs
#     Lowest and highest value for bin edges; can be used either
#     with ``bins`` or ``binwidth``. Defaults to data extremes.
# discrete : bool
#     If True, default to ``binwidth=1`` and draw the bars so that they are
#     centered on their corresponding data points. This avoids "gaps" that may
#     otherwise appear when using discrete (integer) data.
# cumulative : bool
#     If True, plot the cumulative counts as bins increase.
# common_bins : bool
#     If True, use the same bins when semantic variables produce multiple
#     plots. If using a reference rule to determine the bins, it will be computed
#     with the full dataset.
# common_norm : bool
#     If True and using a normalized statistic, the normalization will apply over
#     the full dataset. Otherwise, normalize each histogram independently.
# multiple : {"layer", "dodge", "stack", "fill"}
#     Approach to resolving multiple elements when semantic mapping creates subsets.
#     Only relevant with univariate data.
# element : {"bars", "step", "poly"}
#     Visual representation of the histogram statistic.
#     Only relevant with univariate data.
# fill : bool
#     If True, fill in the space under the histogram.
#     Only relevant with univariate data.
# shrink : number
#     Scale the width of each bar relative to the binwidth by this factor.
#     Only relevant with univariate data.
# kde : bool
#     If True, compute a kernel density estimate to smooth the distribution
#     and show on the plot as (one or more) line(s).
#     Only relevant with univariate data.
# kde_kws : dict
#     Parameters that control the KDE computation, as in :func:`kdeplot`.
# line_kws : dict
#     Parameters that control the KDE visualization, passed to
#     :meth:`matplotlib.axes.Axes.plot`.
# thresh : number or None
#     Cells with a statistic less than or equal to this value will be transparent.
#     Only relevant with bivariate data.
# pthresh : number or None
#     Like ``thresh``, but a value in [0, 1] such that cells with aggregate counts
#     (or other statistics, when used) up to this proportion of the total will be
#     transparent.
# pmax : number or None
#     A value in [0, 1] that sets the saturation point for the colormap at a value
#     such that cells below it constitute this proportion of the total count (or
#     other statistic, when used).
# cbar : bool
#     If True, add a colorbar to annotate the color mapping in a bivariate plot.
#     Note: Does not currently support plots with a ``hue`` variable well.
# cbar_ax : :class:`matplotlib.axes.Axes`
#     Pre-existing axes for the colorbar.
# cbar_kws : dict
#     Additional parameters passed to :meth:`matplotlib.figure.Figure.colorbar`.
# palette : string, list, dict, or :class:`matplotlib.colors.Colormap`
#     Method for choosing the colors to use when mapping the ``hue`` semantic.
#     String values are passed to :func:`color_palette`. List or dict values
#     imply categorical mapping, while a colormap object implies numeric mapping.
# hue_order : vector of strings
#     Specify the order of processing and plotting for categorical levels of the
#     ``hue`` semantic.
# hue_norm : tuple or :class:`matplotlib.colors.Normalize`
#     Either a pair of values that set the normalization range in data units
#     or an object that will map from data units into a [0, 1] interval. Usage
#     implies numeric mapping.
# color : :mod:`matplotlib color <matplotlib.colors>`
#     Single color specification for when hue mapping is not used. Otherwise, the
#     plot will try to hook into the matplotlib property cycle.
# log_scale : bool or number, or pair of bools or numbers
#     Set axis scale(s) to log. A single value sets the data axis for univariate
#     distributions and both axes for bivariate distributions. A pair of values
#     sets each axis independently. Numeric values are interpreted as the desired
#     base (default 10). If `False`, defer to the existing Axes scale.
# legend : bool
#     If False, suppress the legend for semantic variables.
# ax : :class:`matplotlib.axes.Axes`
#     Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca`
#     internally.
# kwargs
#     Other keyword arguments are passed to one of the following matplotlib
#     functions:
#
#     - :meth:`matplotlib.axes.Axes.bar` (univariate, element="bars")
#     - :meth:`matplotlib.axes.Axes.fill_between` (univariate, other element, fill=True)
#     - :meth:`matplotlib.axes.Axes.plot` (univariate, other element, fill=False)
#     - :meth:`matplotlib.axes.Axes.pcolormesh` (bivariate)
#
# Returns
# -------
# :class:`matplotlib.axes.Axes`
#     The matplotlib axes containing the plot.
#
# See Also
# --------
# displot : Figure-level interface to distribution plot functions.
# kdeplot : Plot univariate or bivariate distributions using kernel density estimation.
# rugplot : Plot a tick at each observation value along the x and/or y axes.
# ecdfplot : Plot empirical cumulative distribution functions.
# jointplot : Draw a bivariate plot with univariate marginal distributions.
#
# Notes
# -----
#
# The choice of bins for computing and plotting a histogram can exert
# substantial influence on the insights that one is able to draw from the
# visualization. If the bins are too large, they may erase important features.
# On the other hand, bins that are too small may be dominated by random
# variability, obscuring the shape of the true underlying distribution. The
# default bin size is determined using a reference rule that depends on the
# sample size and variance. This works well in many cases, (i.e., with
# "well-behaved" data) but it fails in others. It is always a good idea to try
# different bin sizes to be sure that you are not missing something important.
# This function allows you to specify bins in several different ways, such as
# by setting the total number of bins to use, the width of each bin, or the
# specific locations where the bins should break.
#
# Examples
# --------
#
# .. include:: ../docstrings/histplot.rst
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 33</u></h3></summary><small><a href=#33>goto cell # 33</a></small>
# <ul>
#
# <li> <b>seaborn</b>
# <ul>
# <li>
# <details><summary><u>seaborn.distributions.histplot</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Plot univariate or bivariate histograms to show distributions of datasets.
#
# A histogram is a classic visualization tool that represents the distribution
# of one or more variables by counting the number of observations that fall within
# discrete bins.
#
# This function can normalize the statistic computed within each bin to estimate
# frequency, density or probability mass, and it can add a smooth curve obtained
# using a kernel density estimate, similar to :func:`kdeplot`.
#
# More information is provided in the :ref:`user guide <tutorial_hist>`.
#
# Parameters
# ----------
# data : :class:`pandas.DataFrame`, :class:`numpy.ndarray`, mapping, or sequence
#     Input data structure. Either a long-form collection of vectors that can be
#     assigned to named variables or a wide-form dataset that will be internally
#     reshaped.
# x, y : vectors or keys in ``data``
#     Variables that specify positions on the x and y axes.
# hue : vector or key in ``data``
#     Semantic variable that is mapped to determine the color of plot elements.
# weights : vector or key in ``data``
#     If provided, weight the contribution of the corresponding data points
#     towards the count in each bin by these factors.
# stat : str
#     Aggregate statistic to compute in each bin.
#     
#     - `count`: show the number of observations in each bin
#     - `frequency`: show the number of observations divided by the bin width
#     - `probability` or `proportion`: normalize such that bar heights sum to 1
#     - `percent`: normalize such that bar heights sum to 100
#     - `density`: normalize such that the total area of the histogram equals 1
# bins : str, number, vector, or a pair of such values
#     Generic bin parameter that can be the name of a reference rule,
#     the number of bins, or the breaks of the bins.
#     Passed to :func:`numpy.histogram_bin_edges`.
# binwidth : number or pair of numbers
#     Width of each bin, overrides ``bins`` but can be used with
#     ``binrange``.
# binrange : pair of numbers or a pair of pairs
#     Lowest and highest value for bin edges; can be used either
#     with ``bins`` or ``binwidth``. Defaults to data extremes.
# discrete : bool
#     If True, default to ``binwidth=1`` and draw the bars so that they are
#     centered on their corresponding data points. This avoids "gaps" that may
#     otherwise appear when using discrete (integer) data.
# cumulative : bool
#     If True, plot the cumulative counts as bins increase.
# common_bins : bool
#     If True, use the same bins when semantic variables produce multiple
#     plots. If using a reference rule to determine the bins, it will be computed
#     with the full dataset.
# common_norm : bool
#     If True and using a normalized statistic, the normalization will apply over
#     the full dataset. Otherwise, normalize each histogram independently.
# multiple : {"layer", "dodge", "stack", "fill"}
#     Approach to resolving multiple elements when semantic mapping creates subsets.
#     Only relevant with univariate data.
# element : {"bars", "step", "poly"}
#     Visual representation of the histogram statistic.
#     Only relevant with univariate data.
# fill : bool
#     If True, fill in the space under the histogram.
#     Only relevant with univariate data.
# shrink : number
#     Scale the width of each bar relative to the binwidth by this factor.
#     Only relevant with univariate data.
# kde : bool
#     If True, compute a kernel density estimate to smooth the distribution
#     and show on the plot as (one or more) line(s).
#     Only relevant with univariate data.
# kde_kws : dict
#     Parameters that control the KDE computation, as in :func:`kdeplot`.
# line_kws : dict
#     Parameters that control the KDE visualization, passed to
#     :meth:`matplotlib.axes.Axes.plot`.
# thresh : number or None
#     Cells with a statistic less than or equal to this value will be transparent.
#     Only relevant with bivariate data.
# pthresh : number or None
#     Like ``thresh``, but a value in [0, 1] such that cells with aggregate counts
#     (or other statistics, when used) up to this proportion of the total will be
#     transparent.
# pmax : number or None
#     A value in [0, 1] that sets the saturation point for the colormap at a value
#     such that cells below it constitute this proportion of the total count (or
#     other statistic, when used).
# cbar : bool
#     If True, add a colorbar to annotate the color mapping in a bivariate plot.
#     Note: Does not currently support plots with a ``hue`` variable well.
# cbar_ax : :class:`matplotlib.axes.Axes`
#     Pre-existing axes for the colorbar.
# cbar_kws : dict
#     Additional parameters passed to :meth:`matplotlib.figure.Figure.colorbar`.
# palette : string, list, dict, or :class:`matplotlib.colors.Colormap`
#     Method for choosing the colors to use when mapping the ``hue`` semantic.
#     String values are passed to :func:`color_palette`. List or dict values
#     imply categorical mapping, while a colormap object implies numeric mapping.
# hue_order : vector of strings
#     Specify the order of processing and plotting for categorical levels of the
#     ``hue`` semantic.
# hue_norm : tuple or :class:`matplotlib.colors.Normalize`
#     Either a pair of values that set the normalization range in data units
#     or an object that will map from data units into a [0, 1] interval. Usage
#     implies numeric mapping.
# color : :mod:`matplotlib color <matplotlib.colors>`
#     Single color specification for when hue mapping is not used. Otherwise, the
#     plot will try to hook into the matplotlib property cycle.
# log_scale : bool or number, or pair of bools or numbers
#     Set axis scale(s) to log. A single value sets the data axis for univariate
#     distributions and both axes for bivariate distributions. A pair of values
#     sets each axis independently. Numeric values are interpreted as the desired
#     base (default 10). If `False`, defer to the existing Axes scale.
# legend : bool
#     If False, suppress the legend for semantic variables.
# ax : :class:`matplotlib.axes.Axes`
#     Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca`
#     internally.
# kwargs
#     Other keyword arguments are passed to one of the following matplotlib
#     functions:
#
#     - :meth:`matplotlib.axes.Axes.bar` (univariate, element="bars")
#     - :meth:`matplotlib.axes.Axes.fill_between` (univariate, other element, fill=True)
#     - :meth:`matplotlib.axes.Axes.plot` (univariate, other element, fill=False)
#     - :meth:`matplotlib.axes.Axes.pcolormesh` (bivariate)
#
# Returns
# -------
# :class:`matplotlib.axes.Axes`
#     The matplotlib axes containing the plot.
#
# See Also
# --------
# displot : Figure-level interface to distribution plot functions.
# kdeplot : Plot univariate or bivariate distributions using kernel density estimation.
# rugplot : Plot a tick at each observation value along the x and/or y axes.
# ecdfplot : Plot empirical cumulative distribution functions.
# jointplot : Draw a bivariate plot with univariate marginal distributions.
#
# Notes
# -----
#
# The choice of bins for computing and plotting a histogram can exert
# substantial influence on the insights that one is able to draw from the
# visualization. If the bins are too large, they may erase important features.
# On the other hand, bins that are too small may be dominated by random
# variability, obscuring the shape of the true underlying distribution. The
# default bin size is determined using a reference rule that depends on the
# sample size and variance. This works well in many cases, (i.e., with
# "well-behaved" data) but it fails in others. It is always a good idea to try
# different bin sizes to be sure that you are not missing something important.
# This function allows you to specify bins in several different ways, such as
# by setting the total number of bins to use, the width of each bin, or the
# specific locations where the bins should break.
#
# Examples
# --------
#
# .. include:: ../docstrings/histplot.rst
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# <li><details><summary><h2><span style='color:#42a5f5'>Data Preparation</span></h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Data Preparation" Calls</u></b></summary>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [[]] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.mean</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the mean of the values over the requested axis.
#
# Parameters
# ----------
# axis : {index (0)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a scalar.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# scalar or Series (if level specified)
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.notnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Series.notnull is an alias for Series.notna.
#
# Detect existing (non-missing) values.
#
# Return a boolean same-sized object indicating if the values are not NA.
# Non-missing values get mapped to True. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
# NA values, such as None or :attr:`numpy.NaN`, get mapped to False
# values.
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is not an NA value.
#
# See Also
# --------
# Series.notnull : Alias of notna.
# Series.isna : Boolean inverse of notna.
# Series.dropna : Omit axes labels with missing values.
# notna : Top-level notna.
#
# Examples
# --------
# Show which entries in a DataFrame are not NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.notna()
#      age   born  name    toy
# 0   True  False  True  False
# 1   True   True  True   True
# 2  False   True  True   True
#
# Show which entries in a Series are not NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.notna()
# 0     True
# 1     True
# 2    False
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>sklearn</b>
# <ul>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Transform features by scaling each feature to a given range.
#
# This estimator scales and translates each feature individually such
# that it is in the given range on the training set, e.g. between
# zero and one.
#
# The transformation is given by::
#
#     X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#     X_scaled = X_std * (max - min) + min
#
# where min, max = feature_range.
#
# This transformation is often used as an alternative to zero mean,
# unit variance scaling.
#
# Read more in the :ref:`User Guide <preprocessing_scaler>`.
#
# Parameters
# ----------
# feature_range : tuple (min, max), default=(0, 1)
#     Desired range of transformed data.
#
# copy : bool, default=True
#     Set to False to perform inplace row normalization and avoid a
#     copy (if the input is already a numpy array).
#
# clip : bool, default=False
#     Set to True to clip transformed values of held-out data to
#     provided `feature range`.
#
#     .. versionadded:: 0.24
#
# Attributes
# ----------
# min_ : ndarray of shape (n_features,)
#     Per feature adjustment for minimum. Equivalent to
#     ``min - X.min(axis=0) * self.scale_``
#
# scale_ : ndarray of shape (n_features,)
#     Per feature relative scaling of the data. Equivalent to
#     ``(max - min) / (X.max(axis=0) - X.min(axis=0))``
#
#     .. versionadded:: 0.17
#        *scale_* attribute.
#
# data_min_ : ndarray of shape (n_features,)
#     Per feature minimum seen in the data
#
#     .. versionadded:: 0.17
#        *data_min_*
#
# data_max_ : ndarray of shape (n_features,)
#     Per feature maximum seen in the data
#
#     .. versionadded:: 0.17
#        *data_max_*
#
# data_range_ : ndarray of shape (n_features,)
#     Per feature range ``(data_max_ - data_min_)`` seen in the data
#
#     .. versionadded:: 0.17
#        *data_range_*
#
# n_features_in_ : int
#     Number of features seen during :term:`fit`.
#
#     .. versionadded:: 0.24
#
# n_samples_seen_ : int
#     The number of samples processed by the estimator.
#     It will be reset on new calls to fit, but increments across
#     ``partial_fit`` calls.
#
# feature_names_in_ : ndarray of shape (`n_features_in_`,)
#     Names of features seen during :term:`fit`. Defined only when `X`
#     has feature names that are all strings.
#
#     .. versionadded:: 1.0
#
# See Also
# --------
# minmax_scale : Equivalent function without the estimator API.
#
# Notes
# -----
# NaNs are treated as missing values: disregarded in fit, and maintained in
# transform.
#
# For a comparison of the different scalers, transformers, and normalizers,
# see :ref:`examples/preprocessing/plot_all_scaling.py
# <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
#
# Examples
# --------
# >>> from sklearn.preprocessing import MinMaxScaler
# >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
# >>> scaler = MinMaxScaler()
# >>> print(scaler.fit(data))
# MinMaxScaler()
# >>> print(scaler.data_max_)
# [ 1. 18.]
# >>> print(scaler.transform(data))
# [[0.   0.  ]
#  [0.25 0.25]
#  [0.5  0.5 ]
#  [1.   1.  ]]
# >>> print(scaler.transform([[2, 2]]))
# [[1.5 0. ]]
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.base.TransformerMixin.fit_transform</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Fit to data, then transform it.
#
# Fits transformer to `X` and `y` with optional parameters `fit_params`
# and returns a transformed version of `X`.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input samples.
#
# y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
#     Target values (None for unsupervised transformations).
#
# **fit_params : dict
#     Additional fit parameters.
#
# Returns
# -------
# X_new : ndarray of shape (n_samples, n_features_new)
#     Transformed array.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler.transform</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Scale features of X according to feature_range.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input data that will be transformed.
#
# Returns
# -------
# Xt : ndarray of shape (n_samples, n_features)
#     Transformed data.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>numpy</b>
# <ul>
# <li>
# <details><summary><u>numpy.ndarray.flatten</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# a.flatten(order='C')
#
# Return a copy of the array collapsed into one dimension.
#
# Parameters
# ----------
# order : {'C', 'F', 'A', 'K'}, optional
#     'C' means to flatten in row-major (C-style) order.
#     'F' means to flatten in column-major (Fortran-
#     style) order. 'A' means to flatten in column-major
#     order if `a` is Fortran *contiguous* in memory,
#     row-major order otherwise. 'K' means to flatten
#     `a` in the order the elements occur in memory.
#     The default is 'C'.
#
# Returns
# -------
# y : ndarray
#     A copy of the input array, flattened to one dimension.
#
# See Also
# --------
# ravel : Return a flattened array.
# flat : A 1-D flat iterator over the array.
#
# Examples
# --------
# >>> a = np.array([[1,2], [3,4]])
# >>> a.flatten()
# array([1, 2, 3, 4])
# >>> a.flatten('F')
# array([1, 3, 2, 4])
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 21</u></h3></summary><small><a href=#21>goto cell # 21</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 25</u></h3></summary><small><a href=#25>goto cell # 25</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 32</u></h3></summary><small><a href=#32>goto cell # 32</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [[]] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 34</u></h3></summary><small><a href=#34>goto cell # 34</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [[]] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.mean</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the mean of the values over the requested axis.
#
# Parameters
# ----------
# axis : {index (0)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a scalar.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# scalar or Series (if level specified)
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 35</u></h3></summary><small><a href=#35>goto cell # 35</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.notnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Series.notnull is an alias for Series.notna.
#
# Detect existing (non-missing) values.
#
# Return a boolean same-sized object indicating if the values are not NA.
# Non-missing values get mapped to True. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
# NA values, such as None or :attr:`numpy.NaN`, get mapped to False
# values.
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is not an NA value.
#
# See Also
# --------
# Series.notnull : Alias of notna.
# Series.isna : Boolean inverse of notna.
# Series.dropna : Omit axes labels with missing values.
# notna : Top-level notna.
#
# Examples
# --------
# Show which entries in a DataFrame are not NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.notna()
#      age   born  name    toy
# 0   True  False  True  False
# 1   True   True  True   True
# 2  False   True  True   True
#
# Show which entries in a Series are not NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.notna()
# 0     True
# 1     True
# 2    False
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 37</u></h3></summary><small><a href=#37>goto cell # 37</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 42</u></h3></summary><small><a href=#42>goto cell # 42</a></small>
# <ul>
#
# <li> <b>sklearn</b>
# <ul>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Transform features by scaling each feature to a given range.
#
# This estimator scales and translates each feature individually such
# that it is in the given range on the training set, e.g. between
# zero and one.
#
# The transformation is given by::
#
#     X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#     X_scaled = X_std * (max - min) + min
#
# where min, max = feature_range.
#
# This transformation is often used as an alternative to zero mean,
# unit variance scaling.
#
# Read more in the :ref:`User Guide <preprocessing_scaler>`.
#
# Parameters
# ----------
# feature_range : tuple (min, max), default=(0, 1)
#     Desired range of transformed data.
#
# copy : bool, default=True
#     Set to False to perform inplace row normalization and avoid a
#     copy (if the input is already a numpy array).
#
# clip : bool, default=False
#     Set to True to clip transformed values of held-out data to
#     provided `feature range`.
#
#     .. versionadded:: 0.24
#
# Attributes
# ----------
# min_ : ndarray of shape (n_features,)
#     Per feature adjustment for minimum. Equivalent to
#     ``min - X.min(axis=0) * self.scale_``
#
# scale_ : ndarray of shape (n_features,)
#     Per feature relative scaling of the data. Equivalent to
#     ``(max - min) / (X.max(axis=0) - X.min(axis=0))``
#
#     .. versionadded:: 0.17
#        *scale_* attribute.
#
# data_min_ : ndarray of shape (n_features,)
#     Per feature minimum seen in the data
#
#     .. versionadded:: 0.17
#        *data_min_*
#
# data_max_ : ndarray of shape (n_features,)
#     Per feature maximum seen in the data
#
#     .. versionadded:: 0.17
#        *data_max_*
#
# data_range_ : ndarray of shape (n_features,)
#     Per feature range ``(data_max_ - data_min_)`` seen in the data
#
#     .. versionadded:: 0.17
#        *data_range_*
#
# n_features_in_ : int
#     Number of features seen during :term:`fit`.
#
#     .. versionadded:: 0.24
#
# n_samples_seen_ : int
#     The number of samples processed by the estimator.
#     It will be reset on new calls to fit, but increments across
#     ``partial_fit`` calls.
#
# feature_names_in_ : ndarray of shape (`n_features_in_`,)
#     Names of features seen during :term:`fit`. Defined only when `X`
#     has feature names that are all strings.
#
#     .. versionadded:: 1.0
#
# See Also
# --------
# minmax_scale : Equivalent function without the estimator API.
#
# Notes
# -----
# NaNs are treated as missing values: disregarded in fit, and maintained in
# transform.
#
# For a comparison of the different scalers, transformers, and normalizers,
# see :ref:`examples/preprocessing/plot_all_scaling.py
# <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
#
# Examples
# --------
# >>> from sklearn.preprocessing import MinMaxScaler
# >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
# >>> scaler = MinMaxScaler()
# >>> print(scaler.fit(data))
# MinMaxScaler()
# >>> print(scaler.data_max_)
# [ 1. 18.]
# >>> print(scaler.transform(data))
# [[0.   0.  ]
#  [0.25 0.25]
#  [0.5  0.5 ]
#  [1.   1.  ]]
# >>> print(scaler.transform([[2, 2]]))
# [[1.5 0. ]]
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.base.TransformerMixin.fit_transform</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Fit to data, then transform it.
#
# Fits transformer to `X` and `y` with optional parameters `fit_params`
# and returns a transformed version of `X`.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input samples.
#
# y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
#     Target values (None for unsupervised transformations).
#
# **fit_params : dict
#     Additional fit parameters.
#
# Returns
# -------
# X_new : ndarray of shape (n_samples, n_features_new)
#     Transformed array.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler.transform</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Scale features of X according to feature_range.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input data that will be transformed.
#
# Returns
# -------
# Xt : ndarray of shape (n_samples, n_features)
#     Transformed data.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 48</u></h3></summary><small><a href=#48>goto cell # 48</a></small>
# <ul>
#
# <li> <b>numpy</b>
# <ul>
# <li>
# <details><summary><u>numpy.ndarray.flatten</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# a.flatten(order='C')
#
# Return a copy of the array collapsed into one dimension.
#
# Parameters
# ----------
# order : {'C', 'F', 'A', 'K'}, optional
#     'C' means to flatten in row-major (C-style) order.
#     'F' means to flatten in column-major (Fortran-
#     style) order. 'A' means to flatten in column-major
#     order if `a` is Fortran *contiguous* in memory,
#     row-major order otherwise. 'K' means to flatten
#     `a` in the order the elements occur in memory.
#     The default is 'C'.
#
# Returns
# -------
# y : ndarray
#     A copy of the input array, flattened to one dimension.
#
# See Also
# --------
# ravel : Return a flattened array.
# flat : A 1-D flat iterator over the array.
#
# Examples
# --------
# >>> a = np.array([[1,2], [3,4]])
# >>> a.flatten()
# array([1, 2, 3, 4])
# >>> a.flatten('F')
# array([1, 3, 2, 4])
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 50</u></h3></summary><small><a href=#50>goto cell # 50</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [[]] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# <ul><li><details><summary><h2>Data Profiling and Exploratory Data Analysis</h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Data Profiling and Exploratory Data Analysis" Calls</u></b></summary>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.isin</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Whether elements in Series are contained in `values`.
#
# Return a boolean Series showing whether each element in the Series
# matches an element in the passed sequence of `values` exactly.
#
# Parameters
# ----------
# values : set or list-like
#     The sequence of values to test. Passing in a single string will
#     raise a ``TypeError``. Instead, turn a single string into a
#     list of one element.
#
# Returns
# -------
# Series
#     Series of booleans indicating if each element is in values.
#
# Raises
# ------
# TypeError
#   * If `values` is a string
#
# See Also
# --------
# DataFrame.isin : Equivalent method on DataFrame.
#
# Examples
# --------
# >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
# ...                'hippo'], name='animal')
# >>> s.isin(['cow', 'lama'])
# 0     True
# 1     True
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# To invert the boolean values, use the ``~`` operator:
#
# >>> ~s.isin(['cow', 'lama'])
# 0    False
# 1    False
# 2    False
# 3     True
# 4    False
# 5     True
# Name: animal, dtype: bool
#
# Passing a single string as ``s.isin('lama')`` will raise an error. Use
# a list of one element instead:
#
# >>> s.isin(['lama'])
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# Strings and integers are distinct and are therefore not comparable:
#
# >>> pd.Series([1]).isin(['1'])
# 0    False
# dtype: bool
# >>> pd.Series([1.1]).isin(['1.1'])
# 0    False
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.info</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Print a concise summary of a Series.
#
# This method prints information about a Series including
# the index dtype, non-null values and memory usage.
#
# .. versionadded:: 1.4.0
#
# Parameters
# ----------
# data : Series
#     Series to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.    
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the Series
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always shows memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
#
# Returns
# -------
# None
#     This method prints a summary of a Series and returns None.
#
# See Also
# --------
# Series.describe: Generate descriptive statistics of Series.
# Series.memory_usage: Memory usage of Series.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> s = pd.Series(text_values, index=int_values)
# >>> s.info()
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# Series name: None
# Non-Null Count  Dtype
# --------------  -----
# 5 non-null      object
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Prints a summary excluding information about its values:
#
# >>> s.info(verbose=False)
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Pipe the output of Series.info to a buffer instead of sys.stdout, get
# the buffer content, and write it to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> s.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, especially
# useful for big Series and for fine-tuning memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
# >>> s.info()
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 7.6+ MB
#
# >>> s.info(memory_usage='deep')
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 55.3 MB
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.info</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Print a concise summary of a DataFrame.
#
# This method prints information about a DataFrame including
# the index dtype and columns, non-null values and memory usage.
#
# Parameters
# ----------
# data : DataFrame
#     DataFrame to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# max_cols : int, optional
#     When to switch from the verbose to the truncated output. If the
#     DataFrame has more than `max_cols` columns, the truncated output
#     is used. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is used.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the DataFrame
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always shows memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
# null_counts : bool, optional
#     .. deprecated:: 1.2.0
#         Use show_counts instead.
#
# Returns
# -------
# None
#     This method prints a summary of a DataFrame and returns None.
#
# See Also
# --------
# DataFrame.describe: Generate descriptive statistics of DataFrame
#     columns.
# DataFrame.memory_usage: Memory usage of DataFrame columns.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
# >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
# ...                   "float_col": float_values})
# >>> df
#     int_col text_col  float_col
# 0        1    alpha       0.00
# 1        2     beta       0.25
# 2        3    gamma       0.50
# 3        4    delta       0.75
# 4        5  epsilon       1.00
#
# Prints information of all columns:
#
# >>> df.info(verbose=True)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Data columns (total 3 columns):
#  #   Column     Non-Null Count  Dtype
# ---  ------     --------------  -----
#  0   int_col    5 non-null      int64
#  1   text_col   5 non-null      object
#  2   float_col  5 non-null      float64
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Prints a summary of columns count and its dtypes but not per column
# information:
#
# >>> df.info(verbose=False)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Columns: 3 entries, int_col to float_col
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Pipe the output of DataFrame.info to a buffer instead of sys.stdout, get
# the buffer content, and write it to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> df.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, especially
# useful for big DataFrames and for fine-tuning memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> df = pd.DataFrame({
# ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
# ... })
# >>> df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 22.9+ MB
#
# >>> df.info(memory_usage='deep')
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 165.9 MB
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.isnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# DataFrame.isnull is an alias for DataFrame.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# DataFrame
#     Mask of bool values for each element in DataFrame that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# DataFrame.isnull : Alias of isna.
# DataFrame.notna : Boolean inverse of isna.
# DataFrame.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.isnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Series.isnull is an alias for Series.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# Series.isnull : Alias of isna.
# Series.notna : Boolean inverse of isna.
# Series.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 2</u></h3></summary><small><a href=#2>goto cell # 2</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [2] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 3</u></h3></summary><small><a href=#3>goto cell # 3</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [2] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 4</u></h3></summary><small><a href=#4>goto cell # 4</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [2] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 5</u></h3></summary><small><a href=#5>goto cell # 5</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [2] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 6</u></h3></summary><small><a href=#6>goto cell # 6</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [2] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 7</u></h3></summary><small><a href=#7>goto cell # 7</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 9</u></h3></summary><small><a href=#9>goto cell # 9</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [2] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 12</u></h3></summary><small><a href=#12>goto cell # 12</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.isin</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Whether elements in Series are contained in `values`.
#
# Return a boolean Series showing whether each element in the Series
# matches an element in the passed sequence of `values` exactly.
#
# Parameters
# ----------
# values : set or list-like
#     The sequence of values to test. Passing in a single string will
#     raise a ``TypeError``. Instead, turn a single string into a
#     list of one element.
#
# Returns
# -------
# Series
#     Series of booleans indicating if each element is in values.
#
# Raises
# ------
# TypeError
#   * If `values` is a string
#
# See Also
# --------
# DataFrame.isin : Equivalent method on DataFrame.
#
# Examples
# --------
# >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
# ...                'hippo'], name='animal')
# >>> s.isin(['cow', 'lama'])
# 0     True
# 1     True
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# To invert the boolean values, use the ``~`` operator:
#
# >>> ~s.isin(['cow', 'lama'])
# 0    False
# 1    False
# 2    False
# 3     True
# 4    False
# 5     True
# Name: animal, dtype: bool
#
# Passing a single string as ``s.isin('lama')`` will raise an error. Use
# a list of one element instead:
#
# >>> s.isin(['lama'])
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# Strings and integers are distinct and are therefore not comparable:
#
# >>> pd.Series([1]).isin(['1'])
# 0    False
# dtype: bool
# >>> pd.Series([1.1]).isin(['1.1'])
# 0    False
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 13</u></h3></summary><small><a href=#13>goto cell # 13</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.info</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Print a concise summary of a Series.
#
# This method prints information about a Series including
# the index dtype, non-null values and memory usage.
#
# .. versionadded:: 1.4.0
#
# Parameters
# ----------
# data : Series
#     Series to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the Series
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always show memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
#
# Returns
# -------
# None
#     This method prints a summary of a Series and returns None.
#
# See Also
# --------
# Series.describe: Generate descriptive statistics of Series.
# Series.memory_usage: Memory usage of Series.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> s = pd.Series(text_values, index=int_values)
# >>> s.info()
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# Series name: None
# Non-Null Count  Dtype
# --------------  -----
# 5 non-null      object
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Prints a summary excluding information about its values:
#
# >>> s.info(verbose=False)
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Pipe output of Series.info to a buffer instead of sys.stdout, get the
# buffer content, and write it to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> s.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, especially
# useful for big Series and for fine-tuning memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
# >>> s.info()
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 7.6+ MB
#
# >>> s.info(memory_usage='deep')
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 55.3 MB
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.info</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Print a concise summary of a DataFrame.
#
# This method prints information about a DataFrame including
# the index dtype and columns, non-null values and memory usage.
#
# Parameters
# ----------
# data : DataFrame
#     DataFrame to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# max_cols : int, optional
#     When to switch from the verbose to the truncated output. If the
#     DataFrame has more than `max_cols` columns, the truncated output
#     is used. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is used.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the DataFrame
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always show memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
# null_counts : bool, optional
#     .. deprecated:: 1.2.0
#         Use show_counts instead.
#
# Returns
# -------
# None
#     This method prints a summary of a DataFrame and returns None.
#
# See Also
# --------
# DataFrame.describe: Generate descriptive statistics of DataFrame
#     columns.
# DataFrame.memory_usage: Memory usage of DataFrame columns.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
# >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
# ...                   "float_col": float_values})
# >>> df
#     int_col text_col  float_col
# 0        1    alpha       0.00
# 1        2     beta       0.25
# 2        3    gamma       0.50
# 3        4    delta       0.75
# 4        5  epsilon       1.00
#
# Prints information of all columns:
#
# >>> df.info(verbose=True)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Data columns (total 3 columns):
#  #   Column     Non-Null Count  Dtype
# ---  ------     --------------  -----
#  0   int_col    5 non-null      int64
#  1   text_col   5 non-null      object
#  2   float_col  5 non-null      float64
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Prints a summary of columns count and its dtypes but not per column
# information:
#
# >>> df.info(verbose=False)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Columns: 3 entries, int_col to float_col
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Pipe output of DataFrame.info to a buffer instead of sys.stdout, get the
# buffer content, and write it to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> df.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, especially
# useful for big DataFrames and for fine-tuning memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> df = pd.DataFrame({
# ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
# ... })
# >>> df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 22.9+ MB
#
# >>> df.info(memory_usage='deep')
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 165.9 MB
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 14</u></h3></summary><small><a href=#14>goto cell # 14</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 16</u></h3></summary><small><a href=#16>goto cell # 16</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 18</u></h3></summary><small><a href=#18>goto cell # 18</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [2] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 20</u></h3></summary><small><a href=#20>goto cell # 20</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 21</u></h3></summary><small><a href=#21>goto cell # 21</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.isnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# DataFrame.isnull is an alias for DataFrame.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# DataFrame
#     Mask of bool values for each element in DataFrame that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# DataFrame.isnull : Alias of isna.
# DataFrame.notna : Boolean inverse of isna.
# DataFrame.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 23</u></h3></summary><small><a href=#23>goto cell # 23</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.isin</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Whether elements in Series are contained in `values`.
#
# Return a boolean Series showing whether each element in the Series
# matches an element in the passed sequence of `values` exactly.
#
# Parameters
# ----------
# values : set or list-like
#     The sequence of values to test. Passing in a single string will
#     raise a ``TypeError``. Instead, turn a single string into a
#     list of one element.
#
# Returns
# -------
# Series
#     Series of booleans indicating if each element is in values.
#
# Raises
# ------
# TypeError
#   * If `values` is a string
#
# See Also
# --------
# DataFrame.isin : Equivalent method on DataFrame.
#
# Examples
# --------
# >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
# ...                'hippo'], name='animal')
# >>> s.isin(['cow', 'lama'])
# 0     True
# 1     True
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# To invert the boolean values, use the ``~`` operator:
#
# >>> ~s.isin(['cow', 'lama'])
# 0    False
# 1    False
# 2    False
# 3     True
# 4    False
# 5     True
# Name: animal, dtype: bool
#
# Passing a single string as ``s.isin('lama')`` will raise an error. Use
# a list of one element instead:
#
# >>> s.isin(['lama'])
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# Strings and integers are distinct and are therefore not comparable:
#
# >>> pd.Series([1]).isin(['1'])
# 0    False
# dtype: bool
# >>> pd.Series([1.1]).isin(['1.1'])
# 0    False
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 25</u></h3></summary><small><a href=#25>goto cell # 25</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.isnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# DataFrame.isnull is an alias for DataFrame.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# DataFrame
#     Mask of bool values for each element in DataFrame that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# DataFrame.isnull : Alias of isna.
# DataFrame.notna : Boolean inverse of isna.
# DataFrame.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 27</u></h3></summary><small><a href=#27>goto cell # 27</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 29</u></h3></summary><small><a href=#29>goto cell # 29</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [2] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 30</u></h3></summary><small><a href=#30>goto cell # 30</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.info</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Print a concise summary of a Series.
#
# This method prints information about a Series including
# the index dtype, non-null values and memory usage.
#
# .. versionadded:: 1.4.0
#
# Parameters
# ----------
# data : Series
#     Series to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the Series
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always shows memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
#
# Returns
# -------
# None
#     This method prints a summary of a Series and returns None.
#
# See Also
# --------
# Series.describe: Generate descriptive statistics of Series.
# Series.memory_usage: Memory usage of Series.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> s = pd.Series(text_values, index=int_values)
# >>> s.info()
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# Series name: None
# Non-Null Count  Dtype
# --------------  -----
# 5 non-null      object
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Prints a summary excluding information about its values:
#
# >>> s.info(verbose=False)
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Pipe output of Series.info to a buffer instead of sys.stdout, get the
# buffer content and write it to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> s.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, especially
# useful for big Series and for fine-tuning memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
# >>> s.info()
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 7.6+ MB
#
# >>> s.info(memory_usage='deep')
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 55.3 MB
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.info</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Print a concise summary of a DataFrame.
#
# This method prints information about a DataFrame including
# the index dtype and columns, non-null values and memory usage.
#
# Parameters
# ----------
# data : DataFrame
#     DataFrame to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# max_cols : int, optional
#     When to switch from the verbose to the truncated output. If the
#     DataFrame has more than `max_cols` columns, the truncated output
#     is used. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is used.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the DataFrame
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always shows memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
# null_counts : bool, optional
#     .. deprecated:: 1.2.0
#         Use show_counts instead.
#
# Returns
# -------
# None
#     This method prints a summary of a DataFrame and returns None.
#
# See Also
# --------
# DataFrame.describe: Generate descriptive statistics of DataFrame
#     columns.
# DataFrame.memory_usage: Memory usage of DataFrame columns.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
# >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
# ...                   "float_col": float_values})
# >>> df
#     int_col text_col  float_col
# 0        1    alpha       0.00
# 1        2     beta       0.25
# 2        3    gamma       0.50
# 3        4    delta       0.75
# 4        5  epsilon       1.00
#
# Prints information of all columns:
#
# >>> df.info(verbose=True)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Data columns (total 3 columns):
#  #   Column     Non-Null Count  Dtype
# ---  ------     --------------  -----
#  0   int_col    5 non-null      int64
#  1   text_col   5 non-null      object
#  2   float_col  5 non-null      float64
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Prints a summary of columns count and its dtypes but not per column
# information:
#
# >>> df.info(verbose=False)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Columns: 3 entries, int_col to float_col
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Pipe output of DataFrame.info to a buffer instead of sys.stdout, get the
# buffer content and write it to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> df.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, especially
# useful for big DataFrames and for fine-tuning memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> df = pd.DataFrame({
# ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
# ... })
# >>> df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 22.9+ MB
#
# >>> df.info(memory_usage='deep')
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 165.9 MB
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 35</u></h3></summary><small><a href=#35>goto cell # 35</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.isnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Series.isnull is an alias for Series.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# Series.isnull : Alias of isna.
# Series.notna : Boolean inverse of isna.
# Series.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 37</u></h3></summary><small><a href=#37>goto cell # 37</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.isnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# DataFrame.isnull is an alias for DataFrame.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# DataFrame
#     Mask of bool values for each element in DataFrame that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# DataFrame.isnull : Alias of isna.
# DataFrame.notna : Boolean inverse of isna.
# DataFrame.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.isnull</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Series.isnull is an alias for Series.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# Series.isnull : Alias of isna.
# Series.notna : Boolean inverse of isna.
# Series.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# <ul><li><details><summary><h2>Data Cleaning Filtering</h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Data Cleaning Filtering" Calls</u></b></summary>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.groupby</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Group DataFrame using a mapper or by a Series of columns.
#
# A groupby operation involves some combination of splitting the
# object, applying a function, and combining the results. This can be
# used to group large amounts of data and compute operations on these
# groups.
#
# Parameters
# ----------
# by : mapping, function, label, or list of labels
#     Used to determine the groups for the groupby.
#     If ``by`` is a function, it's called on each value of the object's
#     index. If a dict or Series is passed, the Series or dict VALUES
#     will be used to determine the groups (the Series' values are first
#     aligned; see ``.align()`` method). If a list or ndarray of length
#     equal to the selected axis is passed (see the `groupby user guide
#     <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
#     the values are used as-is to determine the groups. A label or list
#     of labels may be passed to group by the columns in ``self``.
#     Notice that a tuple is interpreted as a (single) key.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Split along rows (0) or columns (1).
# level : int, level name, or sequence of such, default None
#     If the axis is a MultiIndex (hierarchical), group by a particular
#     level or levels.
# as_index : bool, default True
#     For aggregated output, return object with group labels as the
#     index. Only relevant for DataFrame input. as_index=False is
#     effectively "SQL-style" grouped output.
# sort : bool, default True
#     Sort group keys. Get better performance by turning this off.
#     Note this does not influence the order of observations within each
#     group. Groupby preserves the order of rows within each group.
# group_keys : bool, default True
#     When calling apply, add group keys to index to identify pieces.
# squeeze : bool, default False
#     Reduce the dimensionality of the return type if possible,
#     otherwise return a consistent type.
#
#     .. deprecated:: 1.1.0
#
# observed : bool, default False
#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.
# dropna : bool, default True
#     If True, and if group keys contain NA values, NA values together
#     with row/column will be dropped.
#     If False, NA values will also be treated as the key in groups.
#
#     .. versionadded:: 1.1.0
#
# Returns
# -------
# DataFrameGroupBy
#     Returns a groupby object that contains information about the groups.
#
# See Also
# --------
# resample : Convenience method for frequency conversion and resampling
#     of time series.
#
# Notes
# -----
# See the `user guide
# <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
# detailed usage and examples, including splitting an object into groups,
# iterating through groups, selecting a group, aggregation, and more.
#
# Examples
# --------
# >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
# ...                               'Parrot', 'Parrot'],
# ...                    'Max Speed': [380., 370., 24., 26.]})
# >>> df
#    Animal  Max Speed
# 0  Falcon      380.0
# 1  Falcon      370.0
# 2  Parrot       24.0
# 3  Parrot       26.0
# >>> df.groupby(['Animal']).mean()
#         Max Speed
# Animal
# Falcon      375.0
# Parrot       25.0
#
# **Hierarchical Indexes**
#
# We can groupby different levels of a hierarchical index
# using the `level` parameter:
#
# >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
# ...           ['Captive', 'Wild', 'Captive', 'Wild']]
# >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
# >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
# ...                   index=index)
# >>> df
#                 Max Speed
# Animal Type
# Falcon Captive      390.0
#        Wild         350.0
# Parrot Captive       30.0
#        Wild          20.0
# >>> df.groupby(level=0).mean()
#         Max Speed
# Animal
# Falcon      370.0
# Parrot       25.0
# >>> df.groupby(level="Type").mean()
#          Max Speed
# Type
# Captive      210.0
# Wild         185.0
#
# We can also choose to include NA in group keys or not by setting
# `dropna` parameter, the default setting is `True`.
#
# >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by=["b"]).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
#
# >>> df.groupby(by=["b"], dropna=False).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
# NaN 1   4
#
# >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by="a").sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
#
# >>> df.groupby(by="a", dropna=False).sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
# NaN 12.3   33.0
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.fillna</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Fill NA/NaN values using the specified method.
#
# Parameters
# ----------
# value : scalar, dict, Series, or DataFrame
#     Value to use to fill holes (e.g. 0), alternately a
#     dict/Series/DataFrame of values specifying which value to use for
#     each index (for a Series) or column (for a DataFrame).  Values not
#     in the dict/Series/DataFrame will not be filled. This value cannot
#     be a list.
# method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
#     Method to use for filling holes in reindexed Series.
#     pad / ffill: propagate last valid observation forward to the next
#     valid observation.
#     backfill / bfill: use next valid observation to fill gap.
# axis : {0 or 'index'}
#     Axis along which to fill missing values.
# inplace : bool, default False
#     If True, fill in-place. Note: this will modify any
#     other views on this object (e.g., a no-copy slice for a column in a
#     DataFrame).
# limit : int, default None
#     If method is specified, this is the maximum number of consecutive
#     NaN values to forward/backward fill. In other words, if there is
#     a gap with more than this number of consecutive NaNs, it will only
#     be partially filled. If method is not specified, this is the
#     maximum number of entries along the entire axis where NaNs will be
#     filled. Must be greater than 0 if not None.
# downcast : dict, default is None
#     A dict of item->dtype of what to downcast if possible,
#     or the string 'infer' which will try to downcast to an appropriate
#     equal type (e.g. float64 to int64 if possible).
#
# Returns
# -------
# Series or None
#     Object with missing values filled or None if ``inplace=True``.
#
# See Also
# --------
# interpolate : Fill NaN values using interpolation.
# reindex : Conform object to new index.
# asfreq : Convert TimeSeries to specified frequency.
#
# Examples
# --------
# >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
# ...                    [3, 4, np.nan, 1],
# ...                    [np.nan, np.nan, np.nan, np.nan],
# ...                    [np.nan, 3, np.nan, 4]],
# ...                   columns=list("ABCD"))
# >>> df
#      A    B   C    D
# 0  NaN  2.0 NaN  0.0
# 1  3.0  4.0 NaN  1.0
# 2  NaN  NaN NaN  NaN
# 3  NaN  3.0 NaN  4.0
#
# Replace all NaN elements with 0s.
#
# >>> df.fillna(0)
#      A    B    C    D
# 0  0.0  2.0  0.0  0.0
# 1  3.0  4.0  0.0  1.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  3.0  0.0  4.0
#
# We can also propagate non-null values forward or backward.
#
# >>> df.fillna(method="ffill")
#      A    B   C    D
# 0  NaN  2.0 NaN  0.0
# 1  3.0  4.0 NaN  1.0
# 2  3.0  4.0 NaN  1.0
# 3  3.0  3.0 NaN  4.0
#
# Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
# 2, and 3 respectively.
#
# >>> values = {"A": 0, "B": 1, "C": 2, "D": 3}
# >>> df.fillna(value=values)
#      A    B    C    D
# 0  0.0  2.0  2.0  0.0
# 1  3.0  4.0  2.0  1.0
# 2  0.0  1.0  2.0  3.0
# 3  0.0  3.0  2.0  4.0
#
# Only replace the first NaN element.
#
# >>> df.fillna(value=values, limit=1)
#      A    B    C    D
# 0  0.0  2.0  2.0  0.0
# 1  3.0  4.0  NaN  1.0
# 2  NaN  1.0  NaN  3.0
# 3  NaN  3.0  NaN  4.0
#
# When filling using a DataFrame, replacement happens along
# the same column names and same indices.
#
# >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
# >>> df.fillna(df2)
#      A    B    C    D
# 0  0.0  2.0  0.0  0.0
# 1  3.0  4.0  0.0  1.0
# 2  0.0  0.0  0.0  NaN
# 3  0.0  3.0  0.0  4.0
#
# Note that column D is not affected since it is not present in df2.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 15</u></h3></summary><small><a href=#15>goto cell # 15</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [['date']] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [['date']] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 19</u></h3></summary><small><a href=#19>goto cell # 19</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.groupby</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_id'] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Group DataFrame using a mapper or by a Series of columns.
#
# A groupby operation involves some combination of splitting the
# object, applying a function, and combining the results. This can be
# used to group large amounts of data and compute operations on these
# groups.
#
# Parameters
# ----------
# by : mapping, function, label, or list of labels
#     Used to determine the groups for the groupby.
#     If ``by`` is a function, it's called on each value of the object's
#     index. If a dict or Series is passed, the Series or dict VALUES
#     will be used to determine the groups (the Series' values are first
#     aligned; see ``.align()`` method). If a list or ndarray of length
#     equal to the selected axis is passed (see the `groupby user guide
#     <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
#     the values are used as-is to determine the groups. A label or list
#     of labels may be passed to group by the columns in ``self``.
#     Notice that a tuple is interpreted as a (single) key.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Split along rows (0) or columns (1).
# level : int, level name, or sequence of such, default None
#     If the axis is a MultiIndex (hierarchical), group by a particular
#     level or levels.
# as_index : bool, default True
#     For aggregated output, return object with group labels as the
#     index. Only relevant for DataFrame input. as_index=False is
#     effectively "SQL-style" grouped output.
# sort : bool, default True
#     Sort group keys. Get better performance by turning this off.
#     Note this does not influence the order of observations within each
#     group. Groupby preserves the order of rows within each group.
# group_keys : bool, default True
#     When calling apply, add group keys to index to identify pieces.
# squeeze : bool, default False
#     Reduce the dimensionality of the return type if possible,
#     otherwise return a consistent type.
#
#     .. deprecated:: 1.1.0
#
# observed : bool, default False
#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.
# dropna : bool, default True
#     If True, and if group keys contain NA values, NA values together
#     with row/column will be dropped.
#     If False, NA values will also be treated as the key in groups.
#
#     .. versionadded:: 1.1.0
#
# Returns
# -------
# DataFrameGroupBy
#     Returns a groupby object that contains information about the groups.
#
# See Also
# --------
# resample : Convenience method for frequency conversion and resampling
#     of time series.
#
# Notes
# -----
# See the `user guide
# <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
# detailed usage and examples, including splitting an object into groups,
# iterating through groups, selecting a group, aggregation, and more.
#
# Examples
# --------
# >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
# ...                               'Parrot', 'Parrot'],
# ...                    'Max Speed': [380., 370., 24., 26.]})
# >>> df
#    Animal  Max Speed
# 0  Falcon      380.0
# 1  Falcon      370.0
# 2  Parrot       24.0
# 3  Parrot       26.0
# >>> df.groupby(['Animal']).mean()
#         Max Speed
# Animal
# Falcon      375.0
# Parrot       25.0
#
# **Hierarchical Indexes**
#
# We can groupby different levels of a hierarchical index
# using the `level` parameter:
#
# >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
# ...           ['Captive', 'Wild', 'Captive', 'Wild']]
# >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
# >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
# ...                   index=index)
# >>> df
#                 Max Speed
# Animal Type
# Falcon Captive      390.0
#        Wild         350.0
# Parrot Captive       30.0
#        Wild          20.0
# >>> df.groupby(level=0).mean()
#         Max Speed
# Animal
# Falcon      370.0
# Parrot       25.0
# >>> df.groupby(level="Type").mean()
#          Max Speed
# Type
# Captive      210.0
# Wild         185.0
#
# We can also choose to include NA in group keys or not by setting
# `dropna` parameter, the default setting is `True`.
#
# >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by=["b"]).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
#
# >>> df.groupby(by=["b"], dropna=False).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
# NaN 1   4
#
# >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by="a").sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
#
# >>> df.groupby(by="a", dropna=False).sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
# NaN 12.3   33.0
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 20</u></h3></summary><small><a href=#20>goto cell # 20</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 27</u></h3></summary><small><a href=#27>goto cell # 27</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_name'] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 28</u></h3></summary><small><a href=#28>goto cell # 28</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 34</u></h3></summary><small><a href=#34>goto cell # 34</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.fillna</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Fill NA/NaN values using the specified method.
#
# Parameters
# ----------
# value : scalar, dict, Series, or DataFrame
#     Value to use to fill holes (e.g. 0), alternately a
#     dict/Series/DataFrame of values specifying which value to use for
#     each index (for a Series) or column (for a DataFrame).  Values not
#     in the dict/Series/DataFrame will not be filled. This value cannot
#     be a list.
# method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
#     Method to use for filling holes in reindexed Series.
#     pad / ffill: propagate last valid observation forward to the next
#     valid observation.
#     backfill / bfill: use next valid observation to fill gap.
# axis : {0 or 'index'}
#     Axis along which to fill missing values.
# inplace : bool, default False
#     If True, fill in-place. Note: this will modify any
#     other views on this object (e.g., a no-copy slice for a column in a
#     DataFrame).
# limit : int, default None
#     If method is specified, this is the maximum number of consecutive
#     NaN values to forward/backward fill. In other words, if there is
#     a gap with more than this number of consecutive NaNs, it will only
#     be partially filled. If method is not specified, this is the
#     maximum number of entries along the entire axis where NaNs will be
#     filled. Must be greater than 0 if not None.
# downcast : dict, default is None
#     A dict of item->dtype of what to downcast if possible,
#     or the string 'infer' which will try to downcast to an appropriate
#     equal type (e.g. float64 to int64 if possible).
#
# Returns
# -------
# Series or None
#     Object with missing values filled or None if ``inplace=True``.
#
# See Also
# --------
# interpolate : Fill NaN values using interpolation.
# reindex : Conform object to new index.
# asfreq : Convert TimeSeries to specified frequency.
#
# Examples
# --------
# >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
# ...                    [3, 4, np.nan, 1],
# ...                    [np.nan, np.nan, np.nan, np.nan],
# ...                    [np.nan, 3, np.nan, 4]],
# ...                   columns=list("ABCD"))
# >>> df
#      A    B   C    D
# 0  NaN  2.0 NaN  0.0
# 1  3.0  4.0 NaN  1.0
# 2  NaN  NaN NaN  NaN
# 3  NaN  3.0 NaN  4.0
#
# Replace all NaN elements with 0s.
#
# >>> df.fillna(0)
#      A    B    C    D
# 0  0.0  2.0  0.0  0.0
# 1  3.0  4.0  0.0  1.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  3.0  0.0  4.0
#
# We can also propagate non-null values forward or backward.
#
# >>> df.fillna(method="ffill")
#      A    B   C    D
# 0  NaN  2.0 NaN  0.0
# 1  3.0  4.0 NaN  1.0
# 2  3.0  4.0 NaN  1.0
# 3  3.0  3.0 NaN  4.0
#
# Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
# 2, and 3 respectively.
#
# >>> values = {"A": 0, "B": 1, "C": 2, "D": 3}
# >>> df.fillna(value=values)
#      A    B    C    D
# 0  0.0  2.0  2.0  0.0
# 1  3.0  4.0  2.0  1.0
# 2  0.0  1.0  2.0  3.0
# 3  0.0  3.0  2.0  4.0
#
# Only replace the first NaN element.
#
# >>> df.fillna(value=values, limit=1)
#      A    B    C    D
# 0  0.0  2.0  2.0  0.0
# 1  3.0  4.0  NaN  1.0
# 2  NaN  1.0  NaN  3.0
# 3  NaN  3.0  NaN  4.0
#
# When filling using a DataFrame, replacement happens along
# the same column names and same indices.
#
# >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
# >>> df.fillna(df2)
#      A    B    C    D
# 0  0.0  2.0  0.0  0.0
# 1  3.0  4.0  0.0  1.0
# 2  0.0  0.0  0.0  NaN
# 3  0.0  3.0  0.0  4.0
#
# Note that column D is not affected since it is not present in df2.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 38</u></h3></summary><small><a href=#38>goto cell # 38</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the :ref:`user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 40</u></h3></summary><small><a href=#40>goto cell # 40</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the :ref:`user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# <ul><li><details><summary><h4><s>Data Sub-sampling and Train-test Splitting</s> (no calls found)</h4></summary>
# <ul>
#
# None
#
# </ul>
# </details></li></ul>
# <li><details><summary><h2><span style='color:#42a5f5'>Feature Engineering</span></h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Feature Engineering" Calls</u></b></summary>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [[]] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 32</u></h3></summary><small><a href=#32>goto cell # 32</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [[]] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 34</u></h3></summary><small><a href=#34>goto cell # 34</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [[]] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 50</u></h3></summary><small><a href=#50>goto cell # 50</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [[]] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# <ul><li><details><summary><h2>Feature Transformation</h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Feature Transformation" Calls</u></b></summary>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.mean</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the mean of the values over the requested axis.
#
# Parameters
# ----------
# axis : {index (0)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a scalar.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# scalar or Series (if level specified)
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>sklearn</b>
# <ul>
# <li>
# <details><summary><u>sklearn.base.TransformerMixin.fit_transform</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Fit to data, then transform it.
#
# Fits transformer to `X` and `y` with optional parameters `fit_params`
# and returns a transformed version of `X`.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input samples.
#
# y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
#     Target values (None for unsupervised transformations).
#
# **fit_params : dict
#     Additional fit parameters.
#
# Returns
# -------
# X_new : ndarray of shape (n_samples, n_features_new)
#     Transformed array.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler.transform</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Scale features of X according to feature_range.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input data that will be transformed.
#
# Returns
# -------
# Xt : ndarray of shape (n_samples, n_features)
#     Transformed data.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 20</u></h3></summary><small><a href=#20>goto cell # 20</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 21</u></h3></summary><small><a href=#21>goto cell # 21</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 25</u></h3></summary><small><a href=#25>goto cell # 25</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 28</u></h3></summary><small><a href=#28>goto cell # 28</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 34</u></h3></summary><small><a href=#34>goto cell # 34</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.mean</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the mean of the values over the requested axis.
#
# Parameters
# ----------
# axis : {index (0)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a scalar.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# scalar or Series (if level specified)
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 37</u></h3></summary><small><a href=#37>goto cell # 37</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 42</u></h3></summary><small><a href=#42>goto cell # 42</a></small>
# <ul>
#
# <li> <b>sklearn</b>
# <ul>
# <li>
# <details><summary><u>sklearn.base.TransformerMixin.fit_transform</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Fit to data, then transform it.
#
# Fits transformer to `X` and `y` with optional parameters `fit_params`
# and returns a transformed version of `X`.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input samples.
#
# y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
#     Target values (None for unsupervised transformations).
#
# **fit_params : dict
#     Additional fit parameters.
#
# Returns
# -------
# X_new : ndarray of shape (n_samples, n_features_new)
#     Transformed array.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler.transform</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Scale features of X according to feature_range.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input data that will be transformed.
#
# Returns
# -------
# Xt : ndarray of shape (n_samples, n_features)
#     Transformed data.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# <ul><li><details><summary><h2>Feature Selection</h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Feature Selection" Calls</u></b></summary>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.groupby</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Group DataFrame using a mapper or by a Series of columns.
#
# A groupby operation involves some combination of splitting the
# object, applying a function, and combining the results. This can be
# used to group large amounts of data and compute operations on these
# groups.
#
# Parameters
# ----------
# by : mapping, function, label, or list of labels
#     Used to determine the groups for the groupby.
#     If ``by`` is a function, it's called on each value of the object's
#     index. If a dict or Series is passed, the Series or dict VALUES
#     will be used to determine the groups (the Series' values are first
#     aligned; see ``.align()`` method). If a list or ndarray of length
#     equal to the selected axis is passed (see the `groupby user guide
#     <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
#     the values are used as-is to determine the groups. A label or list
#     of labels may be passed to group by the columns in ``self``.
#     Notice that a tuple is interpreted as a (single) key.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Split along rows (0) or columns (1).
# level : int, level name, or sequence of such, default None
#     If the axis is a MultiIndex (hierarchical), group by a particular
#     level or levels.
# as_index : bool, default True
#     For aggregated output, return object with group labels as the
#     index. Only relevant for DataFrame input. as_index=False is
#     effectively "SQL-style" grouped output.
# sort : bool, default True
#     Sort group keys. Get better performance by turning this off.
#     Note this does not influence the order of observations within each
#     group. Groupby preserves the order of rows within each group.
# group_keys : bool, default True
#     When calling apply, add group keys to index to identify pieces.
# squeeze : bool, default False
#     Reduce the dimensionality of the return type if possible,
#     otherwise return a consistent type.
#
#     .. deprecated:: 1.1.0
#
# observed : bool, default False
#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.
# dropna : bool, default True
#     If True, and if group keys contain NA values, NA values together
#     with row/column will be dropped.
#     If False, NA values will also be treated as the key in groups.
#
#     .. versionadded:: 1.1.0
#
# Returns
# -------
# DataFrameGroupBy
#     Returns a groupby object that contains information about the groups.
#
# See Also
# --------
# resample : Convenience method for frequency conversion and resampling
#     of time series.
#
# Notes
# -----
# See the `user guide
# <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
# detailed usage and examples, including splitting an object into groups,
# iterating through groups, selecting a group, aggregation, and more.
#
# Examples
# --------
# >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
# ...                               'Parrot', 'Parrot'],
# ...                    'Max Speed': [380., 370., 24., 26.]})
# >>> df
#    Animal  Max Speed
# 0  Falcon      380.0
# 1  Falcon      370.0
# 2  Parrot       24.0
# 3  Parrot       26.0
# >>> df.groupby(['Animal']).mean()
#         Max Speed
# Animal
# Falcon      375.0
# Parrot       25.0
#
# **Hierarchical Indexes**
#
# We can group by different levels of a hierarchical index
# using the `level` parameter:
#
# >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
# ...           ['Captive', 'Wild', 'Captive', 'Wild']]
# >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
# >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
# ...                   index=index)
# >>> df
#                 Max Speed
# Animal Type
# Falcon Captive      390.0
#        Wild         350.0
# Parrot Captive       30.0
#        Wild          20.0
# >>> df.groupby(level=0).mean()
#         Max Speed
# Animal
# Falcon      370.0
# Parrot       25.0
# >>> df.groupby(level="Type").mean()
#          Max Speed
# Type
# Captive      210.0
# Wild         185.0
#
# We can also choose whether to include NA in group keys by setting the
# `dropna` parameter; the default setting is `True`.
#
# >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by=["b"]).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
#
# >>> df.groupby(by=["b"], dropna=False).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
# NaN 1   4
#
# >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by="a").sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
#
# >>> df.groupby(by="a", dropna=False).sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
# NaN 12.3   33.0
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 15</u></h3></summary><small><a href=#15>goto cell # 15</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [['date']] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [['date']] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 19</u></h3></summary><small><a href=#19>goto cell # 19</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.groupby</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_id'] | <b>Kwargs:</b> {}</li></ul>
# <blockquote>
# <code>
# Group DataFrame using a mapper or by a Series of columns.
#
# A groupby operation involves some combination of splitting the
# object, applying a function, and combining the results. This can be
# used to group large amounts of data and compute operations on these
# groups.
#
# Parameters
# ----------
# by : mapping, function, label, or list of labels
#     Used to determine the groups for the groupby.
#     If ``by`` is a function, it's called on each value of the object's
#     index. If a dict or Series is passed, the Series or dict VALUES
#     will be used to determine the groups (the Series' values are first
#     aligned; see ``.align()`` method). If a list or ndarray of length
#     equal to the selected axis is passed (see the `groupby user guide
#     <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
#     the values are used as-is to determine the groups. A label or list
#     of labels may be passed to group by the columns in ``self``.
#     Notice that a tuple is interpreted as a (single) key.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Split along rows (0) or columns (1).
# level : int, level name, or sequence of such, default None
#     If the axis is a MultiIndex (hierarchical), group by a particular
#     level or levels.
# as_index : bool, default True
#     For aggregated output, return object with group labels as the
#     index. Only relevant for DataFrame input. as_index=False is
#     effectively "SQL-style" grouped output.
# sort : bool, default True
#     Sort group keys. Get better performance by turning this off.
#     Note this does not influence the order of observations within each
#     group. Groupby preserves the order of rows within each group.
# group_keys : bool, default True
#     When calling apply, add group keys to index to identify pieces.
# squeeze : bool, default False
#     Reduce the dimensionality of the return type if possible,
#     otherwise return a consistent type.
#
#     .. deprecated:: 1.1.0
#
# observed : bool, default False
#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.
# dropna : bool, default True
#     If True, and if group keys contain NA values, NA values together
#     with row/column will be dropped.
#     If False, NA values will also be treated as the key in groups.
#
#     .. versionadded:: 1.1.0
#
# Returns
# -------
# DataFrameGroupBy
#     Returns a groupby object that contains information about the groups.
#
# See Also
# --------
# resample : Convenience method for frequency conversion and resampling
#     of time series.
#
# Notes
# -----
# See the `user guide
# <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
# detailed usage and examples, including splitting an object into groups,
# iterating through groups, selecting a group, aggregation, and more.
#
# Examples
# --------
# >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
# ...                               'Parrot', 'Parrot'],
# ...                    'Max Speed': [380., 370., 24., 26.]})
# >>> df
#    Animal  Max Speed
# 0  Falcon      380.0
# 1  Falcon      370.0
# 2  Parrot       24.0
# 3  Parrot       26.0
# >>> df.groupby(['Animal']).mean()
#         Max Speed
# Animal
# Falcon      375.0
# Parrot       25.0
#
# **Hierarchical Indexes**
#
# We can group by different levels of a hierarchical index
# using the `level` parameter:
#
# >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
# ...           ['Captive', 'Wild', 'Captive', 'Wild']]
# >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
# >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
# ...                   index=index)
# >>> df
#                 Max Speed
# Animal Type
# Falcon Captive      390.0
#        Wild         350.0
# Parrot Captive       30.0
#        Wild          20.0
# >>> df.groupby(level=0).mean()
#         Max Speed
# Animal
# Falcon      370.0
# Parrot       25.0
# >>> df.groupby(level="Type").mean()
#          Max Speed
# Type
# Captive      210.0
# Wild         185.0
#
# We can also choose whether to include NA in group keys by setting the
# `dropna` parameter; the default setting is `True`.
#
# >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by=["b"]).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
#
# >>> df.groupby(by=["b"], dropna=False).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
# NaN 1   4
#
# >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by="a").sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
#
# >>> df.groupby(by="a", dropna=False).sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
# NaN 12.3   33.0
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 27</u></h3></summary><small><a href=#27>goto cell # 27</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_name'] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 38</u></h3></summary><small><a href=#38>goto cell # 38</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1, 'inplace': True}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 40</u></h3></summary><small><a href=#40>goto cell # 40</a></small>
# <ul>
#
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> ['item_cnt_day'] | <b>Kwargs:</b> {'axis': 1}</li></ul>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# <li><details><summary><h2><span style='color:#42a5f5'>Model Building and Training</span></h2></summary>
# <ul>
#
# None
#
# </ul>
# </details></li>
# <ul><li><details><summary><h2>Model Training</h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Model Training" Calls</u></b></summary>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras.engine.sequential.Sequential</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# `Sequential` groups a linear stack of layers into a `tf.keras.Model`.
#
# `Sequential` provides training and inference features on this model.
#
# Examples:
#
# ```python
# Optionally, the first layer can receive an `input_shape` argument:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8, input_shape=(16,)))
# Afterwards, we do automatic shape inference:
# model.add(tf.keras.layers.Dense(4))
#
# This is identical to the following:
# model = tf.keras.Sequential()
# model.add(tf.keras.Input(shape=(16,)))
# model.add(tf.keras.layers.Dense(8))
#
# Note that you can also omit the `input_shape` argument.
# In that case the model doesn't have any weights until the first call
# to a training/evaluation method (since it isn't yet built):
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(4))
# model.weights not created yet
#
# Whereas if you specify the input shape, the model gets built
# continuously as you are adding layers:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8, input_shape=(16,)))
# model.add(tf.keras.layers.Dense(4))
# len(model.weights)
# Returns "4"
#
# When using the delayed-build pattern (no input shape specified), you can
# choose to manually build your model by calling
# `build(batch_input_shape)`:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(4))
# model.build((None, 16))
# len(model.weights)
# Returns "4"
#
# Note that when using the delayed-build pattern (no input shape specified),
# the model gets built the first time you call `fit`, `eval`, or `predict`,
# or the first time you call the model on some input data.
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(1))
# model.compile(optimizer='sgd', loss='mse')
# This builds the model for the first time:
# model.fit(x, y, batch_size=32, epochs=10)
# ```
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.layers.core.dense.Dense</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Just your regular densely-connected NN layer.
#
# `Dense` implements the operation:
# `output = activation(dot(input, kernel) + bias)`
# where `activation` is the element-wise activation function
# passed as the `activation` argument, `kernel` is a weights matrix
# created by the layer, and `bias` is a bias vector created by the layer
# (only applicable if `use_bias` is `True`). These are all attributes of
# `Dense`.
#
# Note: If the input to the layer has a rank greater than 2, then `Dense`
# computes the dot product between the `inputs` and the `kernel` along the
# last axis of the `inputs` and axis 0 of the `kernel` (using `tf.tensordot`).
# For example, if input has dimensions `(batch_size, d0, d1)`,
# then we create a `kernel` with shape `(d1, units)`, and the `kernel` operates
# along axis 2 of the `input`, on every sub-tensor of shape `(1, 1, d1)`
# (there are `batch_size * d0` such sub-tensors).
# The output in this case will have shape `(batch_size, d0, units)`.
#
# Besides, layer attributes cannot be modified after the layer has been called
# once (except the `trainable` attribute).
# When a popular kwarg `input_shape` is passed, then keras will create
# an input layer to insert before the current layer. This can be treated
# equivalent to explicitly defining an `InputLayer`.
#
# Example:
#
# >>> # Create a `Sequential` model and add a Dense layer as the first layer.
# >>> model = tf.keras.models.Sequential()
# >>> model.add(tf.keras.Input(shape=(16,)))
# >>> model.add(tf.keras.layers.Dense(32, activation='relu'))
# >>> # Now the model will take as input arrays of shape (None, 16)
# >>> # and output arrays of shape (None, 32).
# >>> # Note that after the first layer, you don't need to specify
# >>> # the size of the input anymore:
# >>> model.add(tf.keras.layers.Dense(32))
# >>> model.output_shape
# (None, 32)
#
# Args:
#   units: Positive integer, dimensionality of the output space.
#   activation: Activation function to use.
#     If you don't specify anything, no activation is applied
#     (i.e. "linear" activation: `a(x) = x`).
#   use_bias: Boolean, whether the layer uses a bias vector.
#   kernel_initializer: Initializer for the `kernel` weights matrix.
#   bias_initializer: Initializer for the bias vector.
#   kernel_regularizer: Regularizer function applied to
#     the `kernel` weights matrix.
#   bias_regularizer: Regularizer function applied to the bias vector.
#   activity_regularizer: Regularizer function applied to
#     the output of the layer (its "activation").
#   kernel_constraint: Constraint function applied to
#     the `kernel` weights matrix.
#   bias_constraint: Constraint function applied to the bias vector.
#
# Input shape:
#   N-D tensor with shape: `(batch_size, ..., input_dim)`.
#   The most common situation would be
#   a 2D input with shape `(batch_size, input_dim)`.
#
# Output shape:
#   N-D tensor with shape: `(batch_size, ..., units)`.
#   For instance, for a 2D input with shape `(batch_size, input_dim)`,
#   the output would have shape `(batch_size, units)`.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.sequential.Sequential.add</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Adds a layer instance on top of the layer stack.
#
# Args:
#     layer: layer instance.
#
# Raises:
#     TypeError: If `layer` is not a layer instance.
#     ValueError: In case the `layer` argument does not
#         know its input shape.
#     ValueError: In case the `layer` argument has
#         multiple output tensors, or is already connected
#         somewhere else (forbidden in `Sequential` models).
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.training.Model.compile</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Configures the model for training.
#
# Example:
#
# ```python
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
#               loss=tf.keras.losses.BinaryCrossentropy(),
#               metrics=[tf.keras.metrics.BinaryAccuracy(),
#                        tf.keras.metrics.FalseNegatives()])
# ```
#
# Args:
#     optimizer: String (name of optimizer) or optimizer instance. See
#       `tf.keras.optimizers`.
#     loss: Loss function. May be a string (name of loss function), or
#       a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss
#       function is any callable with the signature `loss = fn(y_true,
#       y_pred)`, where `y_true` are the ground truth values, and
#       `y_pred` are the model's predictions.
#       `y_true` should have shape
#       `(batch_size, d0, .. dN)` (except in the case of
#       sparse loss functions such as
#       sparse categorical crossentropy which expects integer arrays of shape
#       `(batch_size, d0, .. dN-1)`).
#       `y_pred` should have shape `(batch_size, d0, .. dN)`.
#       The loss function should return a float tensor.
#       If a custom `Loss` instance is
#       used and reduction is set to `None`, return value has shape
#       `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss
#       values; otherwise, it is a scalar. If the model has multiple outputs,
#       you can use a different loss on each output by passing a dictionary
#       or a list of losses. The loss value that will be minimized by the
#       model will then be the sum of all individual losses, unless
#       `loss_weights` is specified.
#     metrics: List of metrics to be evaluated by the model during training
#       and testing. Each of these can be a string (name of a built-in
#       function), function or a `tf.keras.metrics.Metric` instance. See
#       `tf.keras.metrics`. Typically you will use `metrics=['accuracy']`. A
#       function is any callable with the signature `result = fn(y_true,
#       y_pred)`. To specify different metrics for different outputs of a
#       multi-output model, you could also pass a dictionary, such as
#       `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`.
#       You can also pass a list to specify a metric or a list of metrics
#       for each output, such as `metrics=[['accuracy'], ['accuracy', 'mse']]`
#       or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the
#       strings 'accuracy' or 'acc', we convert this to one of
#       `tf.keras.metrics.BinaryAccuracy`,
#       `tf.keras.metrics.CategoricalAccuracy`,
#       `tf.keras.metrics.SparseCategoricalAccuracy` based on the loss
#       function used and the model output shape. We do a similar
#       conversion for the strings 'crossentropy' and 'ce' as well.
#     loss_weights: Optional list or dictionary specifying scalar coefficients
#       (Python floats) to weight the loss contributions of different model
#       outputs. The loss value that will be minimized by the model will then
#       be the *weighted sum* of all individual losses, weighted by the
#       `loss_weights` coefficients.
#         If a list, it is expected to have a 1:1 mapping to the model's
#           outputs. If a dict, it is expected to map output names (strings)
#           to scalar coefficients.
#     weighted_metrics: List of metrics to be evaluated and weighted by
#       `sample_weight` or `class_weight` during training and testing.
#     run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s
#       logic will not be wrapped in a `tf.function`. Recommended to leave
#       this as `None` unless your `Model` cannot be run inside a
#       `tf.function`. `run_eagerly=True` is not supported when using
#       `tf.distribute.experimental.ParameterServerStrategy`.
#     steps_per_execution: Int. Defaults to 1. The number of batches to run
#       during each `tf.function` call. Running multiple batches inside a
#       single `tf.function` call can greatly improve performance on TPUs or
#       small models with a large Python overhead. At most, one full epoch
#       will be run each execution. If a number larger than the size of the
#       epoch is passed, the execution will be truncated to the size of the
#       epoch. Note that if `steps_per_execution` is set to `N`,
#       `Callback.on_batch_begin` and `Callback.on_batch_end` methods will
#       only be called every `N` batches (i.e. before/after each `tf.function`
#       execution).
#     jit_compile: If `True`, compile the model training step with XLA.
#       [XLA](https://www.tensorflow.org/xla) is an optimizing compiler for
#       machine learning.
#       `jit_compile` is not enabled by default.
#       This option cannot be enabled with `run_eagerly=True`.
#       Note that `jit_compile=True`
#       may not necessarily work for all models.
#       For more information on supported operations please refer to the
#       [XLA documentation](https://www.tensorflow.org/xla).
#       Also refer to
#       [known XLA issues](https://www.tensorflow.org/xla/known_issues) for
#       more details.
#     **kwargs: Arguments supported for backwards compatibility only.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.training.Model.fit</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [] | <b>Kwargs:</b> {'epochs': 32, 'validation_split': 0.2}</li></ul>
# <blockquote>
# <code>
# Trains the model for a fixed number of epochs (iterations on a dataset).
#
# Args:
#     x: Input data. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A dict mapping input names to the corresponding array/tensors,
#         if the model has named inputs.
#       - A `tf.data` dataset. Should return a tuple
#         of either `(inputs, targets)` or
#         `(inputs, targets, sample_weights)`.
#       - A generator or `keras.utils.Sequence` returning `(inputs, targets)`
#         or `(inputs, targets, sample_weights)`.
#       - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a
#         callable that takes a single argument of type
#         `tf.distribute.InputContext`, and returns a `tf.data.Dataset`.
#         `DatasetCreator` should be used when users prefer to specify the
#         per-replica batching and sharding logic for the `Dataset`.
#         See `tf.keras.utils.experimental.DatasetCreator` doc for more
#         information.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given below. If using
#       `tf.distribute.experimental.ParameterServerStrategy`, only
#       `DatasetCreator` type is supported for `x`.
#     y: Target data. Like the input data `x`,
#       it could be either Numpy array(s) or TensorFlow tensor(s).
#       It should be consistent with `x` (you cannot have Numpy inputs and
#       tensor targets, or inversely). If `x` is a dataset, generator,
#       or `keras.utils.Sequence` instance, `y` should
#       not be specified (since targets will be obtained from `x`).
#     batch_size: Integer or `None`.
#         Number of samples per gradient update.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of datasets, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     epochs: Integer. Number of epochs to train the model.
#         An epoch is an iteration over the entire `x` and `y`
#         data provided
#         (unless the `steps_per_epoch` flag is set to
#         something other than None).
#         Note that in conjunction with `initial_epoch`,
#         `epochs` is to be understood as "final epoch".
#         The model is not trained for a number of iterations
#         given by `epochs`, but merely until the epoch
#         of index `epochs` is reached.
#     verbose: 'auto', 0, 1, or 2. Verbosity mode.
#         0 = silent, 1 = progress bar, 2 = one line per epoch.
#         'auto' defaults to 1 for most cases, but 2 when used with
#         `ParameterServerStrategy`. Note that the progress bar is not
#         particularly useful when logged to a file, so verbose=2 is
#         recommended when not running interactively (eg, in a production
#         environment).
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during training.
#         See `tf.keras.callbacks`. Note `tf.keras.callbacks.ProgbarLogger`
#         and `tf.keras.callbacks.History` callbacks are created automatically
#         and need not be passed into `model.fit`.
#         `tf.keras.callbacks.ProgbarLogger` is created or not based on
#         `verbose` argument to `model.fit`.
#         Callbacks with batch-level calls are currently unsupported with
#         `tf.distribute.experimental.ParameterServerStrategy`, and users are
#         advised to implement epoch-level calls instead with an appropriate
#         `steps_per_epoch` value.
#     validation_split: Float between 0 and 1.
#         Fraction of the training data to be used as validation data.
#         The model will set apart this fraction of the training data,
#         will not train on it, and will evaluate
#         the loss and any model metrics
#         on this data at the end of each epoch.
#         The validation data is selected from the last samples
#         in the `x` and `y` data provided, before shuffling. This argument is
#         not supported when `x` is a dataset, generator or
#        `keras.utils.Sequence` instance.
#         `validation_split` is not yet supported with
#         `tf.distribute.experimental.ParameterServerStrategy`.
#     validation_data: Data on which to evaluate
#         the loss and any model metrics at the end of each epoch.
#         The model will not be trained on this data. Thus, note the fact
#         that the validation loss of data provided using `validation_split`
#         or `validation_data` is not affected by regularization layers like
#         noise and dropout.
#         `validation_data` will override `validation_split`.
#         `validation_data` could be:
#           - A tuple `(x_val, y_val)` of Numpy arrays or tensors.
#           - A tuple `(x_val, y_val, val_sample_weights)` of NumPy arrays.
#           - A `tf.data.Dataset`.
#           - A Python generator or `keras.utils.Sequence` returning
#           `(inputs, targets)` or `(inputs, targets, sample_weights)`.
#         `validation_data` is not yet supported with
#         `tf.distribute.experimental.ParameterServerStrategy`.
#     shuffle: Boolean (whether to shuffle the training data
#         before each epoch) or str (for 'batch'). This argument is ignored
#         when `x` is a generator or an object of tf.data.Dataset.
#         'batch' is a special option for dealing
#         with the limitations of HDF5 data; it shuffles in batch-sized
#         chunks. Has no effect when `steps_per_epoch` is not `None`.
#     class_weight: Optional dictionary mapping class indices (integers)
#         to a weight (float) value, used for weighting the loss function
#         (during training only).
#         This can be useful to tell the model to
#         "pay more attention" to samples from
#         an under-represented class.
#     sample_weight: Optional Numpy array of weights for
#         the training samples, used for weighting the loss function
#         (during training only). You can either pass a flat (1D)
#         Numpy array with the same length as the input samples
#         (1:1 mapping between weights and samples),
#         or in the case of temporal data,
#         you can pass a 2D array with shape
#         `(samples, sequence_length)`,
#         to apply a different weight to every timestep of every sample. This
#         argument is not supported when `x` is a dataset, generator, or
#        `keras.utils.Sequence` instance, instead provide the sample_weights
#         as the third element of `x`.
#     initial_epoch: Integer.
#         Epoch at which to start training
#         (useful for resuming a previous training run).
#     steps_per_epoch: Integer or `None`.
#         Total number of steps (batches of samples)
#         before declaring one epoch finished and starting the
#         next epoch. When training with input tensors such as
#         TensorFlow data tensors, the default `None` is equal to
#         the number of samples in your dataset divided by
#         the batch size, or 1 if that cannot be determined. If x is a
#         `tf.data` dataset, and 'steps_per_epoch'
#         is None, the epoch will run until the input dataset is exhausted.
#         When passing an infinitely repeating dataset, you must specify the
#         `steps_per_epoch` argument. If `steps_per_epoch=-1` the training
#         will run indefinitely with an infinitely repeating dataset.
#         This argument is not supported with array inputs.
#         When using `tf.distribute.experimental.ParameterServerStrategy`:
#           * `steps_per_epoch=None` is not supported.
#     validation_steps: Only relevant if `validation_data` is provided and
#         is a `tf.data` dataset. Total number of steps (batches of
#         samples) to draw before stopping when performing validation
#         at the end of every epoch. If 'validation_steps' is None, validation
#         will run until the `validation_data` dataset is exhausted. In the
#         case of an infinitely repeated dataset, it will run into an
#         infinite loop. If 'validation_steps' is specified and only part of
#         the dataset will be consumed, the evaluation will start from the
#         beginning of the dataset at each epoch. This ensures that the same
#         validation samples are used every time.
#     validation_batch_size: Integer or `None`.
#         Number of samples per validation batch.
#         If unspecified, will default to `batch_size`.
#         Do not specify the `validation_batch_size` if your data is in the
#         form of datasets, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     validation_freq: Only relevant if validation data is provided. Integer
#         or `collections.abc.Container` instance (e.g. list, tuple, etc.).
#         If an integer, specifies how many training epochs to run before a
#         new validation run is performed, e.g. `validation_freq=2` runs
#         validation every 2 epochs. If a Container, specifies the epochs on
#         which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
#         validation at the end of the 1st, 2nd, and 10th epochs.
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up
#         when using process-based threading. If unspecified, `workers`
#         will default to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# Unpacking behavior for iterator-like inputs:
#     A common pattern is to pass a tf.data.Dataset, generator, or
#   tf.keras.utils.Sequence to the `x` argument of fit, which will in fact
#   yield not only features (x) but optionally targets (y) and sample weights.
#   Keras requires that the output of such iterator-likes be unambiguous. The
#   iterator should return a tuple of length 1, 2, or 3, where the optional
#   second and third elements will be used for y and sample_weight
#   respectively. Any other type provided will be wrapped in a length one
#   tuple, effectively treating everything as 'x'. When yielding dicts, they
#   should still adhere to the top-level tuple structure.
#   e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate
#   features, targets, and weights from the keys of a single dict.
#     A notable unsupported data type is the namedtuple. The reason is that
#   it behaves like both an ordered datatype (tuple) and a mapping
#   datatype (dict). So given a namedtuple of the form:
#       `namedtuple("example_tuple", ["y", "x"])`
#   it is ambiguous whether to reverse the order of the elements when
#   interpreting the value. Even worse is a tuple of the form:
#       `namedtuple("other_tuple", ["x", "y", "z"])`
#   where it is unclear if the tuple was intended to be unpacked into x, y,
#   and sample_weight or passed through as a single element to `x`. As a
#   result the data processing code will simply raise a ValueError if it
#   encounters a namedtuple. (Along with instructions to remedy the issue.)
#
# Returns:
#     A `History` object. Its `History.history` attribute is
#     a record of training loss values and metrics values
#     at successive epochs, as well as validation loss values
#     and validation metrics values (if applicable).
#
# Raises:
#     RuntimeError: 1. If the model was never compiled or,
#     2. If `model.fit` is wrapped in `tf.function`.
#
#     ValueError: In case of mismatch between the provided input data
#         and what the model expects or when the input data is empty.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 44</u></h3></summary><small><a href=#44>goto cell # 44</a></small>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras.engine.sequential.Sequential</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# `Sequential` groups a linear stack of layers into a `tf.keras.Model`.
#
# `Sequential` provides training and inference features on this model.
#
# Examples:
#
# ```python
# Optionally, the first layer can receive an `input_shape` argument:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8, input_shape=(16,)))
# Afterwards, we do automatic shape inference:
# model.add(tf.keras.layers.Dense(4))
#
# This is identical to the following:
# model = tf.keras.Sequential()
# model.add(tf.keras.Input(shape=(16,)))
# model.add(tf.keras.layers.Dense(8))
#
# Note that you can also omit the `input_shape` argument.
# In that case the model doesn't have any weights until the first call
# to a training/evaluation method (since it isn't yet built):
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(4))
# model.weights not created yet
#
# Whereas if you specify the input shape, the model gets built
# continuously as you are adding layers:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8, input_shape=(16,)))
# model.add(tf.keras.layers.Dense(4))
# len(model.weights)
# Returns "4"
#
# When using the delayed-build pattern (no input shape specified), you can
# choose to manually build your model by calling
# `build(batch_input_shape)`:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(4))
# model.build((None, 16))
# len(model.weights)
# Returns "4"
#
# Note that when using the delayed-build pattern (no input shape specified),
# the model gets built the first time you call `fit`, `eval`, or `predict`,
# or the first time you call the model on some input data.
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(1))
# model.compile(optimizer='sgd', loss='mse')
# This builds the model for the first time:
# model.fit(x, y, batch_size=32, epochs=10)
# ```
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.layers.core.dense.Dense</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [9] | <b>Kwargs:</b> {'kernel_initializer': 'uniform', 'activation': 'relu', 'input_dim': 6}</li></ul>
# <ul><li><b>Args:</b> [9] | <b>Kwargs:</b> {'kernel_initializer': 'uniform', 'activation': 'relu'}</li></ul>
# <ul><li><b>Args:</b> [5] | <b>Kwargs:</b> {'kernel_initializer': 'uniform', 'activation': 'relu'}</li></ul>
# <ul><li><b>Args:</b> [1] | <b>Kwargs:</b> {'kernel_initializer': 'uniform', 'activation': 'linear'}</li></ul>
# <blockquote>
# <code>
# Just your regular densely-connected NN layer.
#
# `Dense` implements the operation:
# `output = activation(dot(input, kernel) + bias)`
# where `activation` is the element-wise activation function
# passed as the `activation` argument, `kernel` is a weights matrix
# created by the layer, and `bias` is a bias vector created by the layer
# (only applicable if `use_bias` is `True`). These are all attributes of
# `Dense`.
#
# Note: If the input to the layer has a rank greater than 2, then `Dense`
# computes the dot product between the `inputs` and the `kernel` along the
# last axis of the `inputs` and axis 0 of the `kernel` (using `tf.tensordot`).
# For example, if input has dimensions `(batch_size, d0, d1)`,
# then we create a `kernel` with shape `(d1, units)`, and the `kernel` operates
# along axis 2 of the `input`, on every sub-tensor of shape `(1, 1, d1)`
# (there are `batch_size * d0` such sub-tensors).
# The output in this case will have shape `(batch_size, d0, units)`.
#
# Besides, layer attributes cannot be modified after the layer has been called
# once (except the `trainable` attribute).
# When a popular kwarg `input_shape` is passed, then keras will create
# an input layer to insert before the current layer. This can be treated
# equivalent to explicitly defining an `InputLayer`.
#
# Example:
#
# >>> # Create a `Sequential` model and add a Dense layer as the first layer.
# >>> model = tf.keras.models.Sequential()
# >>> model.add(tf.keras.Input(shape=(16,)))
# >>> model.add(tf.keras.layers.Dense(32, activation='relu'))
# >>> # Now the model will take as input arrays of shape (None, 16)
# >>> # and output arrays of shape (None, 32).
# >>> # Note that after the first layer, you don't need to specify
# >>> # the size of the input anymore:
# >>> model.add(tf.keras.layers.Dense(32))
# >>> model.output_shape
# (None, 32)
#
# Args:
#   units: Positive integer, dimensionality of the output space.
#   activation: Activation function to use.
#     If you don't specify anything, no activation is applied
#     (i.e. "linear" activation: `a(x) = x`).
#   use_bias: Boolean, whether the layer uses a bias vector.
#   kernel_initializer: Initializer for the `kernel` weights matrix.
#   bias_initializer: Initializer for the bias vector.
#   kernel_regularizer: Regularizer function applied to
#     the `kernel` weights matrix.
#   bias_regularizer: Regularizer function applied to the bias vector.
#   activity_regularizer: Regularizer function applied to
#     the output of the layer (its "activation").
#   kernel_constraint: Constraint function applied to
#     the `kernel` weights matrix.
#   bias_constraint: Constraint function applied to the bias vector.
#
# Input shape:
#   N-D tensor with shape: `(batch_size, ..., input_dim)`.
#   The most common situation would be
#   a 2D input with shape `(batch_size, input_dim)`.
#
# Output shape:
#   N-D tensor with shape: `(batch_size, ..., units)`.
#   For instance, for a 2D input with shape `(batch_size, input_dim)`,
#   the output would have shape `(batch_size, units)`.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.sequential.Sequential.add</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Adds a layer instance on top of the layer stack.
#
# Args:
#     layer: layer instance.
#
# Raises:
#     TypeError: If `layer` is not a layer instance.
#     ValueError: In case the `layer` argument does not
#         know its input shape.
#     ValueError: In case the `layer` argument has
#         multiple output tensors, or is already connected
#         somewhere else (forbidden in `Sequential` models).
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 45</u></h3></summary><small><a href=#45>goto cell # 45</a></small>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.compile</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [] | <b>Kwargs:</b> {'optimizer': 'adam', 'loss': 'mean_absolute_error', 'metrics': ['mse', 'mae']}</li></ul>
# <blockquote>
# <code>
# Configures the model for training.
#
# Example:
#
# ```python
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
#               loss=tf.keras.losses.BinaryCrossentropy(),
#               metrics=[tf.keras.metrics.BinaryAccuracy(),
#                        tf.keras.metrics.FalseNegatives()])
# ```
#
# Args:
#     optimizer: String (name of optimizer) or optimizer instance. See
#       `tf.keras.optimizers`.
#     loss: Loss function. May be a string (name of loss function), or
#       a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss
#       function is any callable with the signature `loss = fn(y_true,
#       y_pred)`, where `y_true` are the ground truth values, and
#       `y_pred` are the model's predictions.
#       `y_true` should have shape
#       `(batch_size, d0, .. dN)` (except in the case of
#       sparse loss functions such as
#       sparse categorical crossentropy which expects integer arrays of shape
#       `(batch_size, d0, .. dN-1)`).
#       `y_pred` should have shape `(batch_size, d0, .. dN)`.
#       The loss function should return a float tensor.
#       If a custom `Loss` instance is
#       used and reduction is set to `None`, return value has shape
#       `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss
#       values; otherwise, it is a scalar. If the model has multiple outputs,
#       you can use a different loss on each output by passing a dictionary
#       or a list of losses. The loss value that will be minimized by the
#       model will then be the sum of all individual losses, unless
#       `loss_weights` is specified.
#     metrics: List of metrics to be evaluated by the model during training
#       and testing. Each of these can be a string (name of a built-in
#       function), function or a `tf.keras.metrics.Metric` instance. See
#       `tf.keras.metrics`. Typically you will use `metrics=['accuracy']`. A
#       function is any callable with the signature `result = fn(y_true,
#       y_pred)`. To specify different metrics for different outputs of a
#       multi-output model, you could also pass a dictionary, such as
#       `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`.
#       You can also pass a list to specify a metric or a list of metrics
#       for each output, such as `metrics=[['accuracy'], ['accuracy', 'mse']]`
#       or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the
#       strings 'accuracy' or 'acc', we convert this to one of
#       `tf.keras.metrics.BinaryAccuracy`,
#       `tf.keras.metrics.CategoricalAccuracy`,
#       `tf.keras.metrics.SparseCategoricalAccuracy` based on the loss
#       function used and the model output shape. We do a similar
#       conversion for the strings 'crossentropy' and 'ce' as well.
#     loss_weights: Optional list or dictionary specifying scalar coefficients
#       (Python floats) to weight the loss contributions of different model
#       outputs. The loss value that will be minimized by the model will then
#       be the *weighted sum* of all individual losses, weighted by the
#       `loss_weights` coefficients.
#         If a list, it is expected to have a 1:1 mapping to the model's
#           outputs. If a dict, it is expected to map output names (strings)
#           to scalar coefficients.
#     weighted_metrics: List of metrics to be evaluated and weighted by
#       `sample_weight` or `class_weight` during training and testing.
#     run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s
#       logic will not be wrapped in a `tf.function`. Recommended to leave
#       this as `None` unless your `Model` cannot be run inside a
#       `tf.function`. `run_eagerly=True` is not supported when using
#       `tf.distribute.experimental.ParameterServerStrategy`.
#     steps_per_execution: Int. Defaults to 1. The number of batches to run
#       during each `tf.function` call. Running multiple batches inside a
#       single `tf.function` call can greatly improve performance on TPUs or
#       small models with a large Python overhead. At most, one full epoch
#       will be run each execution. If a number larger than the size of the
#       epoch is passed, the execution will be truncated to the size of the
#       epoch. Note that if `steps_per_execution` is set to `N`,
#       `Callback.on_batch_begin` and `Callback.on_batch_end` methods will
#       only be called every `N` batches (i.e. before/after each `tf.function`
#       execution).
#     jit_compile: If `True`, compile the model training step with XLA.
#       [XLA](https://www.tensorflow.org/xla) is an optimizing compiler for
#       machine learning.
#       `jit_compile` is not enabled by default.
#       This option cannot be enabled with `run_eagerly=True`.
#       Note that `jit_compile=True`
#       may not necessarily work for all models.
#       For more information on supported operations please refer to the
#       [XLA documentation](https://www.tensorflow.org/xla).
#       Also refer to
#       [known XLA issues](https://www.tensorflow.org/xla/known_issues) for
#       more details.
#     **kwargs: Arguments supported for backwards compatibility only.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 46</u></h3></summary><small><a href=#46>goto cell # 46</a></small>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.fit</u> | <b>(See Args)</b> </summary> <ul><li><b>Args:</b> [] | <b>Kwargs:</b> {'epochs': 32, 'validation_split': 0.2}</li></ul>
# <blockquote>
# <code>
# Trains the model for a fixed number of epochs (iterations on a dataset).
#
# Args:
#     x: Input data. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A dict mapping input names to the corresponding array/tensors,
#         if the model has named inputs.
#       - A `tf.data` dataset. Should return a tuple
#         of either `(inputs, targets)` or
#         `(inputs, targets, sample_weights)`.
#       - A generator or `keras.utils.Sequence` returning `(inputs, targets)`
#         or `(inputs, targets, sample_weights)`.
#       - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a
#         callable that takes a single argument of type
#         `tf.distribute.InputContext`, and returns a `tf.data.Dataset`.
#         `DatasetCreator` should be used when users prefer to specify the
#         per-replica batching and sharding logic for the `Dataset`.
#         See `tf.keras.utils.experimental.DatasetCreator` doc for more
#         information.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given below. If using
#       `tf.distribute.experimental.ParameterServerStrategy`, only
#       `DatasetCreator` type is supported for `x`.
#     y: Target data. Like the input data `x`,
#       it could be either Numpy array(s) or TensorFlow tensor(s).
#       It should be consistent with `x` (you cannot have Numpy inputs and
#       tensor targets, or inversely). If `x` is a dataset, generator,
#       or `keras.utils.Sequence` instance, `y` should
#       not be specified (since targets will be obtained from `x`).
#     batch_size: Integer or `None`.
#         Number of samples per gradient update.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of datasets, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     epochs: Integer. Number of epochs to train the model.
#         An epoch is an iteration over the entire `x` and `y`
#         data provided
#         (unless the `steps_per_epoch` flag is set to
#         something other than None).
#         Note that in conjunction with `initial_epoch`,
#         `epochs` is to be understood as "final epoch".
#         The model is not trained for a number of iterations
#         given by `epochs`, but merely until the epoch
#         of index `epochs` is reached.
#     verbose: 'auto', 0, 1, or 2. Verbosity mode.
#         0 = silent, 1 = progress bar, 2 = one line per epoch.
#         'auto' defaults to 1 for most cases, but 2 when used with
#         `ParameterServerStrategy`. Note that the progress bar is not
#         particularly useful when logged to a file, so verbose=2 is
#         recommended when not running interactively (e.g., in a production
#         environment).
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during training.
#         See `tf.keras.callbacks`. Note `tf.keras.callbacks.ProgbarLogger`
#         and `tf.keras.callbacks.History` callbacks are created automatically
#         and need not be passed into `model.fit`.
#         `tf.keras.callbacks.ProgbarLogger` is created or not based on
#         `verbose` argument to `model.fit`.
#         Callbacks with batch-level calls are currently unsupported with
#         `tf.distribute.experimental.ParameterServerStrategy`, and users are
#         advised to implement epoch-level calls instead with an appropriate
#         `steps_per_epoch` value.
#     validation_split: Float between 0 and 1.
#         Fraction of the training data to be used as validation data.
#         The model will set apart this fraction of the training data,
#         will not train on it, and will evaluate
#         the loss and any model metrics
#         on this data at the end of each epoch.
#         The validation data is selected from the last samples
#         in the `x` and `y` data provided, before shuffling. This argument is
#         not supported when `x` is a dataset, generator or
#        `keras.utils.Sequence` instance.
#         `validation_split` is not yet supported with
#         `tf.distribute.experimental.ParameterServerStrategy`.
#     validation_data: Data on which to evaluate
#         the loss and any model metrics at the end of each epoch.
#         The model will not be trained on this data. Thus, note the fact
#         that the validation loss of data provided using `validation_split`
#         or `validation_data` is not affected by regularization layers like
#         noise and dropout.
#         `validation_data` will override `validation_split`.
#         `validation_data` could be:
#           - A tuple `(x_val, y_val)` of Numpy arrays or tensors.
#           - A tuple `(x_val, y_val, val_sample_weights)` of NumPy arrays.
#           - A `tf.data.Dataset`.
#           - A Python generator or `keras.utils.Sequence` returning
#           `(inputs, targets)` or `(inputs, targets, sample_weights)`.
#         `validation_data` is not yet supported with
#         `tf.distribute.experimental.ParameterServerStrategy`.
#     shuffle: Boolean (whether to shuffle the training data
#         before each epoch) or str (for 'batch'). This argument is ignored
#         when `x` is a generator or an object of tf.data.Dataset.
#         'batch' is a special option for dealing
#         with the limitations of HDF5 data; it shuffles in batch-sized
#         chunks. Has no effect when `steps_per_epoch` is not `None`.
#     class_weight: Optional dictionary mapping class indices (integers)
#         to a weight (float) value, used for weighting the loss function
#         (during training only).
#         This can be useful to tell the model to
#         "pay more attention" to samples from
#         an under-represented class.
#     sample_weight: Optional Numpy array of weights for
#         the training samples, used for weighting the loss function
#         (during training only). You can either pass a flat (1D)
#         Numpy array with the same length as the input samples
#         (1:1 mapping between weights and samples),
#         or in the case of temporal data,
#         you can pass a 2D array with shape
#         `(samples, sequence_length)`,
#         to apply a different weight to every timestep of every sample. This
#         argument is not supported when `x` is a dataset, generator, or
#         `keras.utils.Sequence` instance; instead, provide the sample_weights
#         as the third element of `x`.
#     initial_epoch: Integer.
#         Epoch at which to start training
#         (useful for resuming a previous training run).
#     steps_per_epoch: Integer or `None`.
#         Total number of steps (batches of samples)
#         before declaring one epoch finished and starting the
#         next epoch. When training with input tensors such as
#         TensorFlow data tensors, the default `None` is equal to
#         the number of samples in your dataset divided by
#         the batch size, or 1 if that cannot be determined. If x is a
#         `tf.data` dataset, and 'steps_per_epoch'
#         is None, the epoch will run until the input dataset is exhausted.
#         When passing an infinitely repeating dataset, you must specify the
#         `steps_per_epoch` argument. If `steps_per_epoch=-1` the training
#         will run indefinitely with an infinitely repeating dataset.
#         This argument is not supported with array inputs.
#         When using `tf.distribute.experimental.ParameterServerStrategy`:
#           * `steps_per_epoch=None` is not supported.
#     validation_steps: Only relevant if `validation_data` is provided and
#         is a `tf.data` dataset. Total number of steps (batches of
#         samples) to draw before stopping when performing validation
#         at the end of every epoch. If 'validation_steps' is None, validation
#         will run until the `validation_data` dataset is exhausted. In the
#         case of an infinitely repeated dataset, it will run into an
#         infinite loop. If 'validation_steps' is specified and only part of
#         the dataset will be consumed, the evaluation will start from the
#         beginning of the dataset at each epoch. This ensures that the same
#         validation samples are used every time.
#     validation_batch_size: Integer or `None`.
#         Number of samples per validation batch.
#         If unspecified, will default to `batch_size`.
#         Do not specify the `validation_batch_size` if your data is in the
#         form of datasets, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     validation_freq: Only relevant if validation data is provided. Integer
#         or `collections.abc.Container` instance (e.g. list, tuple, etc.).
#         If an integer, specifies how many training epochs to run before a
#         new validation run is performed, e.g. `validation_freq=2` runs
#         validation every 2 epochs. If a Container, specifies the epochs on
#         which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
#         validation at the end of the 1st, 2nd, and 10th epochs.
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up
#         when using process-based threading. If unspecified, `workers`
#         will default to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# Unpacking behavior for iterator-like inputs:
#     A common pattern is to pass a tf.data.Dataset, generator, or
#   tf.keras.utils.Sequence to the `x` argument of fit, which will in fact
#   yield not only features (x) but optionally targets (y) and sample weights.
#   Keras requires that the output of such iterator-likes be unambiguous. The
#   iterator should return a tuple of length 1, 2, or 3, where the optional
#   second and third elements will be used for y and sample_weight
#   respectively. Any other type provided will be wrapped in a length one
#   tuple, effectively treating everything as 'x'. When yielding dicts, they
#   should still adhere to the top-level tuple structure.
#   e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate
#   features, targets, and weights from the keys of a single dict.
#     A notable unsupported data type is the namedtuple. The reason is that
#   it behaves like both an ordered datatype (tuple) and a mapping
#   datatype (dict). So given a namedtuple of the form:
#       `namedtuple("example_tuple", ["y", "x"])`
#   it is ambiguous whether to reverse the order of the elements when
#   interpreting the value. Even worse is a tuple of the form:
#       `namedtuple("other_tuple", ["x", "y", "z"])`
#   where it is unclear if the tuple was intended to be unpacked into x, y,
#   and sample_weight or passed through as a single element to `x`. As a
#   result the data processing code will simply raise a ValueError if it
#   encounters a namedtuple. (Along with instructions to remedy the issue.)
#
# Returns:
#     A `History` object. Its `History.history` attribute is
#     a record of training loss values and metrics values
#     at successive epochs, as well as validation loss values
#     and validation metrics values (if applicable).
#
# Raises:
#     RuntimeError: 1. If the model was never compiled or,
#     2. If `model.fit` is wrapped in `tf.function`.
#
#     ValueError: In case of mismatch between the provided input data
#         and what the model expects or when the input data is empty.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# <ul><li><details><summary><h4><s>Model Parameter Tuning</s> (no calls found)</h4></summary>
# <ul>
#
# None
#
# </ul>
# </details></li></ul>
# <ul><li><details><summary><h2>Model Validation and Assembling</h2></summary>
# <ul>
#
# <li><details><summary><b><u>View All "Model Validation and Assembling" Calls</u></b></summary>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.summary</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Prints a string summary of the network.
#
# Args:
#     line_length: Total length of printed lines
#         (e.g. set this to adapt the display to different
#         terminal window sizes).
#     positions: Relative or absolute positions of log elements
#         in each line. If not provided,
#         defaults to `[.33, .55, .67, 1.]`.
#     print_fn: Print function to use. Defaults to `print`.
#         It will be called on each line of the summary.
#         You can set it to a custom function
#         in order to capture the string summary.
#     expand_nested: Whether to expand the nested models.
#         If not provided, defaults to `False`.
#     show_trainable: Whether to show if a layer is trainable.
#         If not provided, defaults to `False`.
#
# Raises:
#     ValueError: if `summary()` is called before the model is built.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.training.Model.predict</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Generates output predictions for the input samples.
#
# Computation is done in batches. This method is designed for batch processing
# of large numbers of inputs. It is not intended for use inside of loops
# that iterate over your data and process small numbers of inputs at a time.
#
# For small numbers of inputs that fit in one batch,
# directly use `__call__()` for faster execution, e.g.,
# `model(x)`, or `model(x, training=False)` if you have layers such as
# `tf.keras.layers.BatchNormalization` that behave differently during
# inference. You may pair the individual model call with a `tf.function`
# for additional performance inside your inner loop.
# If you need access to numpy array values instead of tensors after your
# model call, you can use `tensor.numpy()` to get the numpy array value of
# an eager tensor.
#
# Also, note the fact that test loss is not affected by
# regularization layers like noise and dropout.
#
# Note: See [this FAQ entry](
# https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call)
# for more details about the difference between `Model` methods `predict()`
# and `__call__()`.
#
# Args:
#     x: Input samples. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A `tf.data` dataset.
#       - A generator or `keras.utils.Sequence` instance.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given in the `Unpacking behavior
#       for iterator-like inputs` section of `Model.fit`.
#     batch_size: Integer or `None`.
#         Number of samples per batch.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of dataset, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     verbose: Verbosity mode, 0 or 1.
#     steps: Total number of steps (batches of samples)
#         before declaring the prediction round finished.
#         Ignored with the default value of `None`. If x is a `tf.data`
#         dataset and `steps` is None, `predict()` will
#         run until the input dataset is exhausted.
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during prediction.
#         See [callbacks](/api_docs/python/tf/keras/callbacks).
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up when using
#         process-based threading. If unspecified, `workers` will default
#         to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# See the discussion of `Unpacking behavior for iterator-like inputs` for
# `Model.fit`. Note that Model.predict uses the same interpretation rules as
# `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for all
# three methods.
#
# Returns:
#     Numpy array(s) of predictions.
#
# Raises:
#     RuntimeError: If `model.predict` is wrapped in a `tf.function`.
#     ValueError: In case of mismatch between the provided
#         input data and the model's expectations,
#         or in case a stateful model receives a number of samples
#         that is not a multiple of the batch size.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>sklearn</b>
# <ul>
# <li>
# <details><summary><u>sklearn.metrics._regression.mean_squared_error</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Mean squared error regression loss.
#
# Read more in the :ref:`User Guide <mean_squared_error>`.
#
# Parameters
# ----------
# y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
#     Ground truth (correct) target values.
#
# y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
#     Estimated target values.
#
# sample_weight : array-like of shape (n_samples,), default=None
#     Sample weights.
#
# multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'
#     Defines aggregating of multiple output values.
#     Array-like value defines weights used to average errors.
#
#     'raw_values' :
#         Returns a full set of errors in case of multioutput input.
#
#     'uniform_average' :
#         Errors of all outputs are averaged with uniform weight.
#
# squared : bool, default=True
#     If True returns MSE value, if False returns RMSE value.
#
# Returns
# -------
# loss : float or ndarray of floats
#     A non-negative floating point value (the best value is 0.0), or an
#     array of floating point values, one for each individual target.
#
# Examples
# --------
# >>> from sklearn.metrics import mean_squared_error
# >>> y_true = [3, -0.5, 2, 7]
# >>> y_pred = [2.5, 0.0, 2, 8]
# >>> mean_squared_error(y_true, y_pred)
# 0.375
# >>> y_true = [3, -0.5, 2, 7]
# >>> y_pred = [2.5, 0.0, 2, 8]
# >>> mean_squared_error(y_true, y_pred, squared=False)
# 0.612...
# >>> y_true = [[0.5, 1],[-1, 1],[7, -6]]
# >>> y_pred = [[0, 2],[-1, 2],[8, -5]]
# >>> mean_squared_error(y_true, y_pred)
# 0.708...
# >>> mean_squared_error(y_true, y_pred, squared=False)
# 0.822...
# >>> mean_squared_error(y_true, y_pred, multioutput='raw_values')
# array([0.41666667, 1.        ])
# >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
# 0.825...
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 44</u></h3></summary><small><a href=#44>goto cell # 44</a></small>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.summary</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Prints a string summary of the network.
#
# Args:
#     line_length: Total length of printed lines
#         (e.g. set this to adapt the display to different
#         terminal window sizes).
#     positions: Relative or absolute positions of log elements
#         in each line. If not provided,
#         defaults to `[.33, .55, .67, 1.]`.
#     print_fn: Print function to use. Defaults to `print`.
#         It will be called on each line of the summary.
#         You can set it to a custom function
#         in order to capture the string summary.
#     expand_nested: Whether to expand the nested models.
#         If not provided, defaults to `False`.
#     show_trainable: Whether to show if a layer is trainable.
#         If not provided, defaults to `False`.
#
# Raises:
#     ValueError: if `summary()` is called before the model is built.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 47</u></h3></summary><small><a href=#47>goto cell # 47</a></small>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.predict</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Generates output predictions for the input samples.
#
# Computation is done in batches. This method is designed for batch processing
# of large numbers of inputs. It is not intended for use inside of loops
# that iterate over your data and process small numbers of inputs at a time.
#
# For small numbers of inputs that fit in one batch,
# directly use `__call__()` for faster execution, e.g.,
# `model(x)`, or `model(x, training=False)` if you have layers such as
# `tf.keras.layers.BatchNormalization` that behave differently during
# inference. You may pair the individual model call with a `tf.function`
# for additional performance inside your inner loop.
# If you need access to numpy array values instead of tensors after your
# model call, you can use `tensor.numpy()` to get the numpy array value of
# an eager tensor.
#
# Also, note the fact that test loss is not affected by
# regularization layers like noise and dropout.
#
# Note: See [this FAQ entry](
# https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call)
# for more details about the difference between `Model` methods `predict()`
# and `__call__()`.
#
# Args:
#     x: Input samples. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A `tf.data` dataset.
#       - A generator or `keras.utils.Sequence` instance.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given in the `Unpacking behavior
#       for iterator-like inputs` section of `Model.fit`.
#     batch_size: Integer or `None`.
#         Number of samples per batch.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of dataset, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     verbose: Verbosity mode, 0 or 1.
#     steps: Total number of steps (batches of samples)
#         before declaring the prediction round finished.
#         Ignored with the default value of `None`. If x is a `tf.data`
#         dataset and `steps` is None, `predict()` will
#         run until the input dataset is exhausted.
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during prediction.
#         See [callbacks](/api_docs/python/tf/keras/callbacks).
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up when using
#         process-based threading. If unspecified, `workers` will default
#         to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# See the discussion of `Unpacking behavior for iterator-like inputs` for
# `Model.fit`. Note that Model.predict uses the same interpretation rules as
# `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for all
# three methods.
#
# Returns:
#     Numpy array(s) of predictions.
#
# Raises:
#     RuntimeError: If `model.predict` is wrapped in a `tf.function`.
#     ValueError: In case of mismatch between the provided
#         input data and the model's expectations,
#         or in case a stateful model receives a number of samples
#         that is not a multiple of the batch size.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>sklearn</b>
# <ul>
# <li>
# <details><summary><u>sklearn.metrics._regression.mean_squared_error</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Mean squared error regression loss.
#
# Read more in the :ref:`User Guide <mean_squared_error>`.
#
# Parameters
# ----------
# y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
#     Ground truth (correct) target values.
#
# y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
#     Estimated target values.
#
# sample_weight : array-like of shape (n_samples,), default=None
#     Sample weights.
#
# multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'
#     Defines aggregating of multiple output values.
#     Array-like value defines weights used to average errors.
#
#     'raw_values' :
#         Returns a full set of errors in case of multioutput input.
#
#     'uniform_average' :
#         Errors of all outputs are averaged with uniform weight.
#
# squared : bool, default=True
#     If True returns MSE value, if False returns RMSE value.
#
# Returns
# -------
# loss : float or ndarray of floats
#     A non-negative floating point value (the best value is 0.0), or an
#     array of floating point values, one for each individual target.
#
# Examples
# --------
# >>> from sklearn.metrics import mean_squared_error
# >>> y_true = [3, -0.5, 2, 7]
# >>> y_pred = [2.5, 0.0, 2, 8]
# >>> mean_squared_error(y_true, y_pred)
# 0.375
# >>> y_true = [3, -0.5, 2, 7]
# >>> y_pred = [2.5, 0.0, 2, 8]
# >>> mean_squared_error(y_true, y_pred, squared=False)
# 0.612...
# >>> y_true = [[0.5, 1],[-1, 1],[7, -6]]
# >>> y_pred = [[0, 2],[-1, 2],[8, -5]]
# >>> mean_squared_error(y_true, y_pred)
# 0.708...
# >>> mean_squared_error(y_true, y_pred, squared=False)
# 0.822...
# >>> mean_squared_error(y_true, y_pred, multioutput='raw_values')
# array([0.41666667, 1.        ])
# >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
# 0.825...
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
# <li><details open><summary><h3><u>Cell # 48</u></h3></summary><small><a href=#48>goto cell # 48</a></small>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.predict</u> | (No Args Found) </summary>
# <blockquote>
# <code>
# Generates output predictions for the input samples.
#
# Computation is done in batches. This method is designed for batch processing
# of large numbers of inputs. It is not intended for use inside of loops
# that iterate over your data and process small numbers of inputs at a time.
#
# For small numbers of inputs that fit in one batch,
# directly use `__call__()` for faster execution, e.g.,
# `model(x)`, or `model(x, training=False)` if you have layers such as
# `tf.keras.layers.BatchNormalization` that behave differently during
# inference. You may pair the individual model call with a `tf.function`
# for additional performance inside your inner loop.
# If you need access to numpy array values instead of tensors after your
# model call, you can use `tensor.numpy()` to get the numpy array value of
# an eager tensor.
#
# Also, note the fact that test loss is not affected by
# regularization layers like noise and dropout.
#
# Note: See [this FAQ entry](
# https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call)
# for more details about the difference between `Model` methods `predict()`
# and `__call__()`.
#
# Args:
#     x: Input samples. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A `tf.data` dataset.
#       - A generator or `keras.utils.Sequence` instance.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given in the `Unpacking behavior
#       for iterator-like inputs` section of `Model.fit`.
#     batch_size: Integer or `None`.
#         Number of samples per batch.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of dataset, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     verbose: Verbosity mode, 0 or 1.
#     steps: Total number of steps (batches of samples)
#         before declaring the prediction round finished.
#         Ignored with the default value of `None`. If x is a `tf.data`
#         dataset and `steps` is None, `predict()` will
#         run until the input dataset is exhausted.
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during prediction.
#         See [callbacks](/api_docs/python/tf/keras/callbacks).
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up when using
#         process-based threading. If unspecified, `workers` will default
#         to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# See the discussion of `Unpacking behavior for iterator-like inputs` for
# `Model.fit`. Note that Model.predict uses the same interpretation rules as
# `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for all
# three methods.
#
# Returns:
#     Numpy array(s) of predictions.
#
# Raises:
#     RuntimeError: If `model.predict` is wrapped in a `tf.function`.
#     ValueError: In case of mismatch between the provided
#         input data and the model's expectations,
#         or in case a stateful model receives a number of samples
#         that is not a multiple of the batch size.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details></li>
#
# </ul>
# </details></li></ul>
# </ul>
# <hr>
#
# <details><summary><h2>View All ML API Calls in Notebook</h2></summary>
# <ul>
#
# <li> <b>keras</b>
# <ul>
# <li>
# <details><summary><u>keras</u></summary>
# <blockquote>
# <code>
# Implementation of the Keras API, the high-level API of TensorFlow.
#
# Detailed documentation and user guides are available at
# [keras.io](https://keras.io).
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.sequential.Sequential</u></summary>
# <blockquote>
# <code>
# `Sequential` groups a linear stack of layers into a `tf.keras.Model`.
#
# `Sequential` provides training and inference features on this model.
#
# Examples:
#
# ```python
# Optionally, the first layer can receive an `input_shape` argument:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8, input_shape=(16,)))
# Afterwards, we do automatic shape inference:
# model.add(tf.keras.layers.Dense(4))
#
# This is identical to the following:
# model = tf.keras.Sequential()
# model.add(tf.keras.Input(shape=(16,)))
# model.add(tf.keras.layers.Dense(8))
#
# Note that you can also omit the `input_shape` argument.
# In that case the model doesn't have any weights until the first call
# to a training/evaluation method (since it isn't yet built):
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(4))
# model.weights not created yet
#
# Whereas if you specify the input shape, the model gets built
# continuously as you are adding layers:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8, input_shape=(16,)))
# model.add(tf.keras.layers.Dense(4))
# len(model.weights)
# Returns "4"
#
# When using the delayed-build pattern (no input shape specified), you can
# choose to manually build your model by calling
# `build(batch_input_shape)`:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(4))
# model.build((None, 16))
# len(model.weights)
# Returns "4"
#
# Note that when using the delayed-build pattern (no input shape specified),
# the model gets built the first time you call `fit`, `eval`, or `predict`,
# or the first time you call the model on some input data.
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(1))
# model.compile(optimizer='sgd', loss='mse')
# This builds the model for the first time:
# model.fit(x, y, batch_size=32, epochs=10)
# ```
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.sequential.Sequential.add</u></summary>
# <blockquote>
# <code>
# Adds a layer instance on top of the layer stack.
#
# Args:
#     layer: layer instance.
#
# Raises:
#     TypeError: If `layer` is not a layer instance.
#     ValueError: In case the `layer` argument does not
#         know its input shape.
#     ValueError: In case the `layer` argument has
#         multiple output tensors, or is already connected
#         somewhere else (forbidden in `Sequential` models).
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.training.Model.compile</u></summary>
# <blockquote>
# <code>
# Configures the model for training.
#
# Example:
#
# ```python
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
#               loss=tf.keras.losses.BinaryCrossentropy(),
#               metrics=[tf.keras.metrics.BinaryAccuracy(),
#                        tf.keras.metrics.FalseNegatives()])
# ```
#
# Args:
#     optimizer: String (name of optimizer) or optimizer instance. See
#       `tf.keras.optimizers`.
#     loss: Loss function. May be a string (name of loss function), or
#       a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss
#       function is any callable with the signature `loss = fn(y_true,
#       y_pred)`, where `y_true` are the ground truth values, and
#       `y_pred` are the model's predictions.
#       `y_true` should have shape
#       `(batch_size, d0, .. dN)` (except in the case of
#       sparse loss functions such as
#       sparse categorical crossentropy which expects integer arrays of shape
#       `(batch_size, d0, .. dN-1)`).
#       `y_pred` should have shape `(batch_size, d0, .. dN)`.
#       The loss function should return a float tensor.
#       If a custom `Loss` instance is
#       used and reduction is set to `None`, return value has shape
#       `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss
#       values; otherwise, it is a scalar. If the model has multiple outputs,
#       you can use a different loss on each output by passing a dictionary
#       or a list of losses. The loss value that will be minimized by the
#       model will then be the sum of all individual losses, unless
#       `loss_weights` is specified.
#     metrics: List of metrics to be evaluated by the model during training
#       and testing. Each of these can be a string (name of a built-in
#       function), function or a `tf.keras.metrics.Metric` instance. See
#       `tf.keras.metrics`. Typically you will use `metrics=['accuracy']`. A
#       function is any callable with the signature `result = fn(y_true,
#       y_pred)`. To specify different metrics for different outputs of a
#       multi-output model, you could also pass a dictionary, such as
#       `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`.
#       You can also pass a list to specify a metric or a list of metrics
#       for each output, such as `metrics=[['accuracy'], ['accuracy', 'mse']]`
#       or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the
#       strings 'accuracy' or 'acc', we convert this to one of
#       `tf.keras.metrics.BinaryAccuracy`,
#       `tf.keras.metrics.CategoricalAccuracy`,
#       `tf.keras.metrics.SparseCategoricalAccuracy` based on the loss
#       function used and the model output shape. We do a similar
#       conversion for the strings 'crossentropy' and 'ce' as well.
#     loss_weights: Optional list or dictionary specifying scalar coefficients
#       (Python floats) to weight the loss contributions of different model
#       outputs. The loss value that will be minimized by the model will then
#       be the *weighted sum* of all individual losses, weighted by the
#       `loss_weights` coefficients.
#         If a list, it is expected to have a 1:1 mapping to the model's
#           outputs. If a dict, it is expected to map output names (strings)
#           to scalar coefficients.
#     weighted_metrics: List of metrics to be evaluated and weighted by
#       `sample_weight` or `class_weight` during training and testing.
#     run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s
#       logic will not be wrapped in a `tf.function`. Recommended to leave
#       this as `None` unless your `Model` cannot be run inside a
#       `tf.function`. `run_eagerly=True` is not supported when using
#       `tf.distribute.experimental.ParameterServerStrategy`.
#     steps_per_execution: Int. Defaults to 1. The number of batches to run
#       during each `tf.function` call. Running multiple batches inside a
#       single `tf.function` call can greatly improve performance on TPUs or
#       small models with a large Python overhead. At most, one full epoch
#       will be run each execution. If a number larger than the size of the
#       epoch is passed, the execution will be truncated to the size of the
#       epoch. Note that if `steps_per_execution` is set to `N`,
#       `Callback.on_batch_begin` and `Callback.on_batch_end` methods will
#       only be called every `N` batches (i.e. before/after each `tf.function`
#       execution).
#     jit_compile: If `True`, compile the model training step with XLA.
#       [XLA](https://www.tensorflow.org/xla) is an optimizing compiler for
#       machine learning.
#       `jit_compile` is not enabled by default.
#       This option cannot be enabled with `run_eagerly=True`.
#       Note that `jit_compile=True`
#       may not necessarily work for all models.
#       For more information on supported operations please refer to the
#       [XLA documentation](https://www.tensorflow.org/xla).
#       Also refer to
#       [known XLA issues](https://www.tensorflow.org/xla/known_issues) for
#       more details.
#     **kwargs: Arguments supported for backwards compatibility only.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.training.Model.fit</u></summary>
# <blockquote>
# <code>
# Trains the model for a fixed number of epochs (iterations on a dataset).
#
# Args:
#     x: Input data. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A dict mapping input names to the corresponding array/tensors,
#         if the model has named inputs.
#       - A `tf.data` dataset. Should return a tuple
#         of either `(inputs, targets)` or
#         `(inputs, targets, sample_weights)`.
#       - A generator or `keras.utils.Sequence` returning `(inputs, targets)`
#         or `(inputs, targets, sample_weights)`.
#       - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a
#         callable that takes a single argument of type
#         `tf.distribute.InputContext`, and returns a `tf.data.Dataset`.
#         `DatasetCreator` should be used when users prefer to specify the
#         per-replica batching and sharding logic for the `Dataset`.
#         See `tf.keras.utils.experimental.DatasetCreator` doc for more
#         information.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given below. If using
#       `tf.distribute.experimental.ParameterServerStrategy`, only
#       `DatasetCreator` type is supported for `x`.
#     y: Target data. Like the input data `x`,
#       it could be either Numpy array(s) or TensorFlow tensor(s).
#       It should be consistent with `x` (you cannot have Numpy inputs and
#       tensor targets, or inversely). If `x` is a dataset, generator,
#       or `keras.utils.Sequence` instance, `y` should
#       not be specified (since targets will be obtained from `x`).
#     batch_size: Integer or `None`.
#         Number of samples per gradient update.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of datasets, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     epochs: Integer. Number of epochs to train the model.
#         An epoch is an iteration over the entire `x` and `y`
#         data provided
#         (unless the `steps_per_epoch` flag is set to
#         something other than None).
#         Note that in conjunction with `initial_epoch`,
#         `epochs` is to be understood as "final epoch".
#         The model is not trained for a number of iterations
#         given by `epochs`, but merely until the epoch
#         of index `epochs` is reached.
#     verbose: 'auto', 0, 1, or 2. Verbosity mode.
#         0 = silent, 1 = progress bar, 2 = one line per epoch.
#         'auto' defaults to 1 for most cases, but 2 when used with
#         `ParameterServerStrategy`. Note that the progress bar is not
#         particularly useful when logged to a file, so verbose=2 is
#         recommended when not running interactively (eg, in a production
#         environment).
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during training.
#         See `tf.keras.callbacks`. Note `tf.keras.callbacks.ProgbarLogger`
#         and `tf.keras.callbacks.History` callbacks are created automatically
#         and need not be passed into `model.fit`.
#         `tf.keras.callbacks.ProgbarLogger` is created or not based on
#         `verbose` argument to `model.fit`.
#         Callbacks with batch-level calls are currently unsupported with
#         `tf.distribute.experimental.ParameterServerStrategy`, and users are
#         advised to implement epoch-level calls instead with an appropriate
#         `steps_per_epoch` value.
#     validation_split: Float between 0 and 1.
#         Fraction of the training data to be used as validation data.
#         The model will set apart this fraction of the training data,
#         will not train on it, and will evaluate
#         the loss and any model metrics
#         on this data at the end of each epoch.
#         The validation data is selected from the last samples
#         in the `x` and `y` data provided, before shuffling. This argument is
#         not supported when `x` is a dataset, generator or
#         `keras.utils.Sequence` instance.
#         `validation_split` is not yet supported with
#         `tf.distribute.experimental.ParameterServerStrategy`.
#     validation_data: Data on which to evaluate
#         the loss and any model metrics at the end of each epoch.
#         The model will not be trained on this data. Thus, note the fact
#         that the validation loss of data provided using `validation_split`
#         or `validation_data` is not affected by regularization layers like
#         noise and dropout.
#         `validation_data` will override `validation_split`.
#         `validation_data` could be:
#           - A tuple `(x_val, y_val)` of Numpy arrays or tensors.
#           - A tuple `(x_val, y_val, val_sample_weights)` of NumPy arrays.
#           - A `tf.data.Dataset`.
#           - A Python generator or `keras.utils.Sequence` returning
#           `(inputs, targets)` or `(inputs, targets, sample_weights)`.
#         `validation_data` is not yet supported with
#         `tf.distribute.experimental.ParameterServerStrategy`.
#     shuffle: Boolean (whether to shuffle the training data
#         before each epoch) or str (for 'batch'). This argument is ignored
#         when `x` is a generator or an object of tf.data.Dataset.
#         'batch' is a special option for dealing
#         with the limitations of HDF5 data; it shuffles in batch-sized
#         chunks. Has no effect when `steps_per_epoch` is not `None`.
#     class_weight: Optional dictionary mapping class indices (integers)
#         to a weight (float) value, used for weighting the loss function
#         (during training only).
#         This can be useful to tell the model to
#         "pay more attention" to samples from
#         an under-represented class.
#     sample_weight: Optional Numpy array of weights for
#         the training samples, used for weighting the loss function
#         (during training only). You can either pass a flat (1D)
#         Numpy array with the same length as the input samples
#         (1:1 mapping between weights and samples),
#         or in the case of temporal data,
#         you can pass a 2D array with shape
#         `(samples, sequence_length)`,
#         to apply a different weight to every timestep of every sample. This
#         argument is not supported when `x` is a dataset, generator, or
#         `keras.utils.Sequence` instance; instead, provide the sample_weights
#         as the third element of `x`.
#     initial_epoch: Integer.
#         Epoch at which to start training
#         (useful for resuming a previous training run).
#     steps_per_epoch: Integer or `None`.
#         Total number of steps (batches of samples)
#         before declaring one epoch finished and starting the
#         next epoch. When training with input tensors such as
#         TensorFlow data tensors, the default `None` is equal to
#         the number of samples in your dataset divided by
#         the batch size, or 1 if that cannot be determined. If x is a
#         `tf.data` dataset, and 'steps_per_epoch'
#         is None, the epoch will run until the input dataset is exhausted.
#         When passing an infinitely repeating dataset, you must specify the
#         `steps_per_epoch` argument. If `steps_per_epoch=-1` the training
#         will run indefinitely with an infinitely repeating dataset.
#         This argument is not supported with array inputs.
#         When using `tf.distribute.experimental.ParameterServerStrategy`:
#           * `steps_per_epoch=None` is not supported.
#     validation_steps: Only relevant if `validation_data` is provided and
#         is a `tf.data` dataset. Total number of steps (batches of
#         samples) to draw before stopping when performing validation
#         at the end of every epoch. If 'validation_steps' is None, validation
#         will run until the `validation_data` dataset is exhausted. In the
#         case of an infinitely repeated dataset, it will run into an
#         infinite loop. If 'validation_steps' is specified and only part of
#         the dataset will be consumed, the evaluation will start from the
#         beginning of the dataset at each epoch. This ensures that the same
#         validation samples are used every time.
#     validation_batch_size: Integer or `None`.
#         Number of samples per validation batch.
#         If unspecified, will default to `batch_size`.
#         Do not specify the `validation_batch_size` if your data is in the
#         form of datasets, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     validation_freq: Only relevant if validation data is provided. Integer
#         or `collections.abc.Container` instance (e.g. list, tuple, etc.).
#         If an integer, specifies how many training epochs to run before a
#         new validation run is performed, e.g. `validation_freq=2` runs
#         validation every 2 epochs. If a Container, specifies the epochs on
#         which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
#         validation at the end of the 1st, 2nd, and 10th epochs.
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up
#         when using process-based threading. If unspecified, `workers`
#         will default to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# Unpacking behavior for iterator-like inputs:
#     A common pattern is to pass a tf.data.Dataset, generator, or
#   tf.keras.utils.Sequence to the `x` argument of fit, which will in fact
#   yield not only features (x) but optionally targets (y) and sample weights.
#   Keras requires that the output of such iterator-likes be unambiguous. The
#   iterator should return a tuple of length 1, 2, or 3, where the optional
#   second and third elements will be used for y and sample_weight
#   respectively. Any other type provided will be wrapped in a length one
#   tuple, effectively treating everything as 'x'. When yielding dicts, they
#   should still adhere to the top-level tuple structure.
#   e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate
#   features, targets, and weights from the keys of a single dict.
#     A notable unsupported data type is the namedtuple. The reason is that
#   it behaves like both an ordered datatype (tuple) and a mapping
#   datatype (dict). So given a namedtuple of the form:
#       `namedtuple("example_tuple", ["y", "x"])`
#   it is ambiguous whether to reverse the order of the elements when
#   interpreting the value. Even worse is a tuple of the form:
#       `namedtuple("other_tuple", ["x", "y", "z"])`
#   where it is unclear if the tuple was intended to be unpacked into x, y,
#   and sample_weight or passed through as a single element to `x`. As a
#   result the data processing code will simply raise a ValueError if it
#   encounters a namedtuple. (Along with instructions to remedy the issue.)
#
# Returns:
#     A `History` object. Its `History.history` attribute is
#     a record of training loss values and metrics values
#     at successive epochs, as well as validation loss values
#     and validation metrics values (if applicable).
#
# Raises:
#     RuntimeError: 1. If the model was never compiled or,
#     2. If `model.fit` is wrapped in `tf.function`.
#
#     ValueError: In case of mismatch between the provided input data
#         and what the model expects or when the input data is empty.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.training.Model.predict</u></summary>
# <blockquote>
# <code>
# Generates output predictions for the input samples.
#
# Computation is done in batches. This method is designed for batch processing
# of large numbers of inputs. It is not intended for use inside of loops
# that iterate over your data and process small numbers of inputs at a time.
#
# For small numbers of inputs that fit in one batch,
# directly use `__call__()` for faster execution, e.g.,
# `model(x)`, or `model(x, training=False)` if you have layers such as
# `tf.keras.layers.BatchNormalization` that behave differently during
# inference. You may pair the individual model call with a `tf.function`
# for additional performance inside your inner loop.
# If you need access to numpy array values instead of tensors after your
# model call, you can use `tensor.numpy()` to get the numpy array value of
# an eager tensor.
#
# Also, note the fact that test loss is not affected by
# regularization layers like noise and dropout.
#
# Note: See [this FAQ entry](
# https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call)
# for more details about the difference between `Model` methods `predict()`
# and `__call__()`.
#
# Args:
#     x: Input samples. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A `tf.data` dataset.
#       - A generator or `keras.utils.Sequence` instance.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given in the `Unpacking behavior
#       for iterator-like inputs` section of `Model.fit`.
#     batch_size: Integer or `None`.
#         Number of samples per batch.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of dataset, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     verbose: Verbosity mode, 0 or 1.
#     steps: Total number of steps (batches of samples)
#         before declaring the prediction round finished.
#         Ignored with the default value of `None`. If x is a `tf.data`
#         dataset and `steps` is None, `predict()` will
#         run until the input dataset is exhausted.
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during prediction.
#         See [callbacks](/api_docs/python/tf/keras/callbacks).
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up when using
#         process-based threading. If unspecified, `workers` will default
#         to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# See the discussion of `Unpacking behavior for iterator-like inputs` for
# `Model.fit`. Note that Model.predict uses the same interpretation rules as
# `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for all
# three methods.
#
# Returns:
#     Numpy array(s) of predictions.
#
# Raises:
#     RuntimeError: If `model.predict` is wrapped in a `tf.function`.
#     ValueError: In case of mismatch between the provided
#         input data and the model's expectations,
#         or in case a stateful model receives a number of samples
#         that is not a multiple of the batch size.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.training.Model.summary</u></summary>
# <blockquote>
# <code>
# Prints a string summary of the network.
#
# Args:
#     line_length: Total length of printed lines
#         (e.g. set this to adapt the display to different
#         terminal window sizes).
#     positions: Relative or absolute positions of log elements
#         in each line. If not provided,
#         defaults to `[.33, .55, .67, 1.]`.
#     print_fn: Print function to use. Defaults to `print`.
#         It will be called on each line of the summary.
#         You can set it to a custom function
#         in order to capture the string summary.
#     expand_nested: Whether to expand the nested models.
#         If not provided, defaults to `False`.
#     show_trainable: Whether to show if a layer is trainable.
#         If not provided, defaults to `False`.
#
# Raises:
#     ValueError: if `summary()` is called before the model is built.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.layers.core.dense.Dense</u></summary>
# <blockquote>
# <code>
# Just your regular densely-connected NN layer.
#
# `Dense` implements the operation:
# `output = activation(dot(input, kernel) + bias)`
# where `activation` is the element-wise activation function
# passed as the `activation` argument, `kernel` is a weights matrix
# created by the layer, and `bias` is a bias vector created by the layer
# (only applicable if `use_bias` is `True`). These are all attributes of
# `Dense`.
#
# Note: If the input to the layer has a rank greater than 2, then `Dense`
# computes the dot product between the `inputs` and the `kernel` along the
# last axis of the `inputs` and axis 0 of the `kernel` (using `tf.tensordot`).
# For example, if input has dimensions `(batch_size, d0, d1)`,
# then we create a `kernel` with shape `(d1, units)`, and the `kernel` operates
# along axis 2 of the `input`, on every sub-tensor of shape `(1, 1, d1)`
# (there are `batch_size * d0` such sub-tensors).
# The output in this case will have shape `(batch_size, d0, units)`.
#
# Besides, layer attributes cannot be modified after the layer has been called
# once (except the `trainable` attribute).
# When a popular kwarg `input_shape` is passed, then keras will create
# an input layer to insert before the current layer. This can be treated
# equivalent to explicitly defining an `InputLayer`.
#
# Example:
#
# >>> # Create a `Sequential` model and add a Dense layer as the first layer.
# >>> model = tf.keras.models.Sequential()
# >>> model.add(tf.keras.Input(shape=(16,)))
# >>> model.add(tf.keras.layers.Dense(32, activation='relu'))
# >>> # Now the model will take as input arrays of shape (None, 16)
# >>> # and output arrays of shape (None, 32).
# >>> # Note that after the first layer, you don't need to specify
# >>> # the size of the input anymore:
# >>> model.add(tf.keras.layers.Dense(32))
# >>> model.output_shape
# (None, 32)
#
# Args:
#   units: Positive integer, dimensionality of the output space.
#   activation: Activation function to use.
#     If you don't specify anything, no activation is applied
#     (i.e. "linear" activation: `a(x) = x`).
#   use_bias: Boolean, whether the layer uses a bias vector.
#   kernel_initializer: Initializer for the `kernel` weights matrix.
#   bias_initializer: Initializer for the bias vector.
#   kernel_regularizer: Regularizer function applied to
#     the `kernel` weights matrix.
#   bias_regularizer: Regularizer function applied to the bias vector.
#   activity_regularizer: Regularizer function applied to
#     the output of the layer (its "activation").
#   kernel_constraint: Constraint function applied to
#     the `kernel` weights matrix.
#   bias_constraint: Constraint function applied to the bias vector.
#
# Input shape:
#   N-D tensor with shape: `(batch_size, ..., input_dim)`.
#   The most common situation would be
#   a 2D input with shape `(batch_size, input_dim)`.
#
# Output shape:
#   N-D tensor with shape: `(batch_size, ..., units)`.
#   For instance, for a 2D input with shape `(batch_size, input_dim)`,
#   the output would have shape `(batch_size, units)`.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>matplotlib</b>
# <ul>
# <li>
# <details><summary><u>matplotlib.pyplot</u></summary>
# <blockquote>
# <code>
# `matplotlib.pyplot` is a state-based interface to matplotlib. It provides
# an implicit, MATLAB-like way of plotting.  It also opens figures on your
# screen, and acts as the figure GUI manager.
#
# pyplot is mainly intended for interactive plots and simple cases of
# programmatic plot generation::
#
#     import numpy as np
#     import matplotlib.pyplot as plt
#
#     x = np.arange(0, 5, 0.1)
#     y = np.sin(x)
#     plt.plot(x, y)
#
# The explicit (object-oriented) API is recommended for complex plots, though
# pyplot is still usually used to create the figure and often the axes in the
# figure. See `.pyplot.figure`, `.pyplot.subplots`, and
# `.pyplot.subplot_mosaic` to create figures, and
# :doc:`Axes API <../axes_api>` for the plotting methods on an axes::
#
#     import numpy as np
#     import matplotlib.pyplot as plt
#
#     x = np.arange(0, 5, 0.1)
#     y = np.sin(x)
#     fig, ax = plt.subplots()
#     ax.plot(x, y)
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>numpy</b>
# <ul>
# <li>
# <details><summary><u>numpy</u></summary>
# <blockquote>
# <code>
# NumPy
# =====
#
# Provides
#   1. An array object of arbitrary homogeneous items
#   2. Fast mathematical operations over arrays
#   3. Linear Algebra, Fourier Transforms, Random Number Generation
#
# How to use the documentation
# ----------------------------
# Documentation is available in two forms: docstrings provided
# with the code, and a loose standing reference guide, available from
# `the NumPy homepage <https://numpy.org>`_.
#
# We recommend exploring the docstrings using
# `IPython <https://ipython.org>`_, an advanced Python shell with
# TAB-completion and introspection capabilities.  See below for further
# instructions.
#
# The docstring examples assume that `numpy` has been imported as `np`::
#
#   >>> import numpy as np
#
# Code snippets are indicated by three greater-than signs::
#
#   >>> x = 42
#   >>> x = x + 1
#
# Use the built-in ``help`` function to view a function's docstring::
#
#   >>> help(np.sort)
#   ... # doctest: +SKIP
#
# For some objects, ``np.info(obj)`` may provide additional help.  This is
# particularly true if you see the line "Help on ufunc object:" at the top
# of the help() page.  Ufuncs are implemented in C, not Python, for speed.
# The native Python help() does not know how to view their help, but our
# np.info() function does.
#
# To search for documents containing a keyword, do::
#
#   >>> np.lookfor('keyword')
#   ... # doctest: +SKIP
#
# General-purpose documents like a glossary and help on the basic concepts
# of numpy are available under the ``doc`` sub-module::
#
#   >>> from numpy import doc
#   >>> help(doc)
#   ... # doctest: +SKIP
#
# Available subpackages
# ---------------------
# doc
#     Topical documentation on broadcasting, indexing, etc.
# lib
#     Basic functions used by several sub-packages.
# random
#     Core Random Tools
# linalg
#     Core Linear Algebra Tools
# fft
#     Core FFT routines
# polynomial
#     Polynomial tools
# testing
#     NumPy testing tools
# f2py
#     Fortran to Python Interface Generator.
# distutils
#     Enhancements to distutils with support for
#     Fortran compilers and more.
#
# Utilities
# ---------
# test
#     Run numpy unittests
# show_config
#     Show numpy build configuration
# dual
#     Overwrite certain functions with high-performance SciPy tools.
#     Note: `numpy.dual` is deprecated.  Use the functions from NumPy or SciPy
#     directly instead of importing them from `numpy.dual`.
# matlib
#     Make everything matrices.
# __version__
#     NumPy version string
#
# Viewing documentation using IPython
# -----------------------------------
# Start IPython with the NumPy profile (``ipython -p numpy``), which will
# import `numpy` under the alias `np`.  Then, use the ``cpaste`` command to
# paste examples into the shell.  To see which functions are available in
# `numpy`, type ``np.<TAB>`` (where ``<TAB>`` refers to the TAB key), or use
# ``np.*cos*?<ENTER>`` (where ``<ENTER>`` refers to the ENTER key) to narrow
# down the list.  To view the docstring for a function, use
# ``np.cos?<ENTER>`` (to view the docstring) and ``np.cos??<ENTER>`` (to view
# the source code).
#
# Copies vs. in-place operation
# -----------------------------
# Most of the functions in `numpy` return a copy of the array argument
# (e.g., `np.sort`).  In-place versions of these functions are often
# available as array methods, i.e. ``x = np.array([1,2,3]); x.sort()``.
# Exceptions to this rule are documented.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>numpy.ndarray</u></summary>
# <blockquote>
# <code>
# ndarray(shape, dtype=float, buffer=None, offset=0,
#         strides=None, order=None)
#
# An array object represents a multidimensional, homogeneous array
# of fixed-size items.  An associated data-type object describes the
# format of each element in the array (its byte-order, how many bytes it
# occupies in memory, whether it is an integer, a floating point number,
# or something else, etc.)
#
# Arrays should be constructed using `array`, `zeros` or `empty` (refer
# to the See Also section below).  The parameters given here refer to
# a low-level method (`ndarray(...)`) for instantiating an array.
#
# For more information, refer to the `numpy` module and examine the
# methods and attributes of an array.
#
# Parameters
# ----------
# (for the __new__ method; see Notes below)
#
# shape : tuple of ints
#     Shape of created array.
# dtype : data-type, optional
#     Any object that can be interpreted as a numpy data type.
# buffer : object exposing buffer interface, optional
#     Used to fill the array with data.
# offset : int, optional
#     Offset of array data in buffer.
# strides : tuple of ints, optional
#     Strides of data in memory.
# order : {'C', 'F'}, optional
#     Row-major (C-style) or column-major (Fortran-style) order.
#
# Attributes
# ----------
# T : ndarray
#     Transpose of the array.
# data : buffer
#     The array's elements, in memory.
# dtype : dtype object
#     Describes the format of the elements in the array.
# flags : dict
#     Dictionary containing information related to memory use, e.g.,
#     'C_CONTIGUOUS', 'OWNDATA', 'WRITEABLE', etc.
# flat : numpy.flatiter object
#     Flattened version of the array as an iterator.  The iterator
#     allows assignments, e.g., ``x.flat = 3`` (See `ndarray.flat` for
#     assignment examples; TODO).
# imag : ndarray
#     Imaginary part of the array.
# real : ndarray
#     Real part of the array.
# size : int
#     Number of elements in the array.
# itemsize : int
#     The memory use of each array element in bytes.
# nbytes : int
#     The total number of bytes required to store the array data,
#     i.e., ``itemsize * size``.
# ndim : int
#     The array's number of dimensions.
# shape : tuple of ints
#     Shape of the array.
# strides : tuple of ints
#     The step-size required to move from one element to the next in
#     memory. For example, a contiguous ``(3, 4)`` array of type
#     ``int16`` in C-order has strides ``(8, 2)``.  This implies that
#     to move from element to element in memory requires jumps of 2 bytes.
#     To move from row-to-row, one needs to jump 8 bytes at a time
#     (``2 * 4``).
# ctypes : ctypes object
#     Class containing properties of the array needed for interaction
#     with ctypes.
# base : ndarray
#     If the array is a view into another array, that array is its `base`
#     (unless that array is also a view).  The `base` array is where the
#     array data is actually stored.
#
# See Also
# --------
# array : Construct an array.
# zeros : Create an array, each element of which is zero.
# empty : Create an array, but leave its allocated memory unchanged (i.e.,
#         it contains "garbage").
# dtype : Create a data-type.
# numpy.typing.NDArray : An ndarray alias :term:`generic <generic type>`
#                        w.r.t. its `dtype.type <numpy.dtype.type>`.
#
# Notes
# -----
# There are two modes of creating an array using ``__new__``:
#
# 1. If `buffer` is None, then only `shape`, `dtype`, and `order`
#    are used.
# 2. If `buffer` is an object exposing the buffer interface, then
#    all keywords are interpreted.
#
# No ``__init__`` method is needed because the array is fully initialized
# after the ``__new__`` method.
#
# Examples
# --------
# These examples illustrate the low-level `ndarray` constructor.  Refer
# to the `See Also` section above for easier ways of constructing an
# ndarray.
#
# First mode, `buffer` is None:
#
# >>> np.ndarray(shape=(2,2), dtype=float, order='F')
# array([[0.0e+000, 0.0e+000], # random
#        [     nan, 2.5e-323]])
#
# Second mode:
#
# >>> np.ndarray((2,), buffer=np.array([1,2,3]),
# ...            offset=np.int_().itemsize,
# ...            dtype=int) # offset = 1*itemsize, i.e. skip first element
# array([2, 3])
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>numpy.ndarray.flatten</u></summary>
# <blockquote>
# <code>
# a.flatten(order='C')
#
# Return a copy of the array collapsed into one dimension.
#
# Parameters
# ----------
# order : {'C', 'F', 'A', 'K'}, optional
#     'C' means to flatten in row-major (C-style) order.
#     'F' means to flatten in column-major (Fortran-
#     style) order. 'A' means to flatten in column-major
#     order if `a` is Fortran *contiguous* in memory,
#     row-major order otherwise. 'K' means to flatten
#     `a` in the order the elements occur in memory.
#     The default is 'C'.
#
# Returns
# -------
# y : ndarray
#     A copy of the input array, flattened to one dimension.
#
# See Also
# --------
# ravel : Return a flattened array.
# flat : A 1-D flat iterator over the array.
#
# Examples
# --------
# >>> a = np.array([[1,2], [3,4]])
# >>> a.flatten()
# array([1, 2, 3, 4])
# >>> a.flatten('F')
# array([1, 3, 2, 4])
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>os</b>
# <ul>
# <li>
# <details><summary><u>os.walk</u></summary>
# <blockquote>
# <code>
# Directory tree generator.
#
# For each directory in the directory tree rooted at top (including top
# itself, but excluding '.' and '..'), yields a 3-tuple
#
#     dirpath, dirnames, filenames
#
# dirpath is a string, the path to the directory.  dirnames is a list of
# the names of the subdirectories in dirpath (excluding '.' and '..').
# filenames is a list of the names of the non-directory files in dirpath.
# Note that the names in the lists are just names, with no path components.
# To get a full path (which begins with top) to a file or directory in
# dirpath, do os.path.join(dirpath, name).
#
# If optional arg 'topdown' is true or not specified, the triple for a
# directory is generated before the triples for any of its subdirectories
# (directories are generated top down).  If topdown is false, the triple
# for a directory is generated after the triples for all of its
# subdirectories (directories are generated bottom up).
#
# When topdown is true, the caller can modify the dirnames list in-place
# (e.g., via del or slice assignment), and walk will only recurse into the
# subdirectories whose names remain in dirnames; this can be used to prune the
# search, or to impose a specific order of visiting.  Modifying dirnames when
# topdown is false has no effect on the behavior of os.walk(), since the
# directories in dirnames have already been generated by the time dirnames
# itself is generated. No matter the value of topdown, the list of
# subdirectories is retrieved before the tuples for the directory and its
# subdirectories are generated.
#
# By default errors from the os.scandir() call are ignored.  If
# optional arg 'onerror' is specified, it should be a function; it
# will be called with one argument, an OSError instance.  It can
# report the error to continue with the walk, or raise the exception
# to abort the walk.  Note that the filename is available as the
# filename attribute of the exception object.
#
# By default, os.walk does not follow symbolic links to subdirectories on
# systems that support them.  In order to get this functionality, set the
# optional argument 'followlinks' to true.
#
# Caution:  if you pass a relative pathname for top, don't change the
# current working directory between resumptions of walk.  walk never
# changes the current directory, and assumes that the client doesn't
# either.
#
# Example:
#
# import os
# from os.path import join, getsize
# for root, dirs, files in os.walk('python/Lib/email'):
#     print(root, "consumes", end="")
#     print(sum(getsize(join(root, name)) for name in files), end="")
#     print("bytes in", len(files), "non-directory files")
#     if 'CVS' in dirs:
#         dirs.remove('CVS')  # don't visit CVS directories
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>pandas</b>
# <ul>
# <li>
# <details><summary><u>pandas</u></summary>
# <blockquote>
# <code>
# pandas - a powerful data analysis and manipulation library for Python
# =====================================================================
#
# **pandas** is a Python package providing fast, flexible, and expressive data
# structures designed to make working with "relational" or "labeled" data both
# easy and intuitive. It aims to be the fundamental high-level building block for
# doing practical, **real world** data analysis in Python. Additionally, it has
# the broader goal of becoming **the most powerful and flexible open source data
# analysis / manipulation tool available in any language**. It is already well on
# its way toward this goal.
#
# Main Features
# -------------
# Here are just a few of the things that pandas does well:
#
#   - Easy handling of missing data in floating point as well as non-floating
#     point data.
#   - Size mutability: columns can be inserted and deleted from DataFrame and
#     higher dimensional objects
#   - Automatic and explicit data alignment: objects can be explicitly aligned
#     to a set of labels, or the user can simply ignore the labels and let
#     `Series`, `DataFrame`, etc. automatically align the data for you in
#     computations.
#   - Powerful, flexible group by functionality to perform split-apply-combine
#     operations on data sets, for both aggregating and transforming data.
#   - Make it easy to convert ragged, differently-indexed data in other Python
#     and NumPy data structures into DataFrame objects.
#   - Intelligent label-based slicing, fancy indexing, and subsetting of large
#     data sets.
#   - Intuitive merging and joining data sets.
#   - Flexible reshaping and pivoting of data sets.
#   - Hierarchical labeling of axes (possible to have multiple labels per tick).
#   - Robust IO tools for loading data from flat files (CSV and delimited),
#     Excel files, databases, and saving/loading data from the ultrafast HDF5
#     format.
#   - Time series-specific functionality: date range generation and frequency
#     conversion, moving window statistics, date shifting and lagging.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame</u></summary>
# <blockquote>
# <code>
# Two-dimensional, size-mutable, potentially heterogeneous tabular data.
#
# Data structure also contains labeled axes (rows and columns).
# Arithmetic operations align on both row and column labels. Can be
# thought of as a dict-like container for Series objects. The primary
# pandas data structure.
#
# Parameters
# ----------
# data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
#     Dict can contain Series, arrays, constants, dataclass or list-like objects. If
#     data is a dict, column order follows insertion-order. If a dict contains Series
#     which have an index defined, it is aligned by its index.
#
#     .. versionchanged:: 0.25.0
#        If data is a list of dicts, column order follows insertion-order.
#
# index : Index or array-like
#     Index to use for resulting frame. Will default to RangeIndex if
#     no indexing information part of input data and no index provided.
# columns : Index or array-like
#     Column labels to use for resulting frame when data does not have them,
#     defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
#     will perform column selection instead.
# dtype : dtype, default None
#     Data type to force. Only a single dtype is allowed. If None, infer.
# copy : bool or None, default None
#     Copy data from inputs.
#     For dict data, the default of None behaves like ``copy=True``.  For DataFrame
#     or 2d ndarray input, the default of None behaves like ``copy=False``.
#
#     .. versionchanged:: 1.3.0
#
# See Also
# --------
# DataFrame.from_records : Constructor from tuples, also record arrays.
# DataFrame.from_dict : From dicts of Series, arrays, or dicts.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_table : Read general delimited file into DataFrame.
# read_clipboard : Read text from clipboard into DataFrame.
#
# Examples
# --------
# Constructing DataFrame from a dictionary.
#
# >>> d = {'col1': [1, 2], 'col2': [3, 4]}
# >>> df = pd.DataFrame(data=d)
# >>> df
#    col1  col2
# 0     1     3
# 1     2     4
#
# Notice that the inferred dtype is int64.
#
# >>> df.dtypes
# col1    int64
# col2    int64
# dtype: object
#
# To enforce a single dtype:
#
# >>> df = pd.DataFrame(data=d, dtype=np.int8)
# >>> df.dtypes
# col1    int8
# col2    int8
# dtype: object
#
# Constructing DataFrame from a dictionary including Series:
#
# >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
# >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
#    col1  col2
# 0     0   NaN
# 1     1   NaN
# 2     2   2.0
# 3     3   3.0
#
# Constructing DataFrame from numpy ndarray:
#
# >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
# ...                    columns=['a', 'b', 'c'])
# >>> df2
#    a  b  c
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9
#
# Constructing DataFrame from a numpy ndarray that has labeled columns:
#
# >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
# ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
# >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
# ...
# >>> df3
#    c  a
# 0  3  1
# 1  6  4
# 2  9  7
#
# Constructing DataFrame from dataclass:
#
# >>> from dataclasses import make_dataclass
# >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
# >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
#    x  y
# 0  0  0
# 1  0  3
# 2  2  3
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u></summary>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.groupby</u></summary>
# <blockquote>
# <code>
# Group DataFrame using a mapper or by a Series of columns.
#
# A groupby operation involves some combination of splitting the
# object, applying a function, and combining the results. This can be
# used to group large amounts of data and compute operations on these
# groups.
#
# Parameters
# ----------
# by : mapping, function, label, or list of labels
#     Used to determine the groups for the groupby.
#     If ``by`` is a function, it's called on each value of the object's
#     index. If a dict or Series is passed, the Series or dict VALUES
#     will be used to determine the groups (the Series' values are first
#     aligned; see ``.align()`` method). If a list or ndarray of length
#     equal to the selected axis is passed (see the `groupby user guide
#     <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
#     the values are used as-is to determine the groups. A label or list
#     of labels may be passed to group by the columns in ``self``.
#     Notice that a tuple is interpreted as a (single) key.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Split along rows (0) or columns (1).
# level : int, level name, or sequence of such, default None
#     If the axis is a MultiIndex (hierarchical), group by a particular
#     level or levels.
# as_index : bool, default True
#     For aggregated output, return object with group labels as the
#     index. Only relevant for DataFrame input. as_index=False is
#     effectively "SQL-style" grouped output.
# sort : bool, default True
#     Sort group keys. Get better performance by turning this off.
#     Note this does not influence the order of observations within each
#     group. Groupby preserves the order of rows within each group.
# group_keys : bool, default True
#     When calling apply, add group keys to index to identify pieces.
# squeeze : bool, default False
#     Reduce the dimensionality of the return type if possible,
#     otherwise return a consistent type.
#
#     .. deprecated:: 1.1.0
#
# observed : bool, default False
#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.
# dropna : bool, default True
#     If True, and if group keys contain NA values, NA values together
#     with row/column will be dropped.
#     If False, NA values will also be treated as the key in groups.
#
#     .. versionadded:: 1.1.0
#
# Returns
# -------
# DataFrameGroupBy
#     Returns a groupby object that contains information about the groups.
#
# See Also
# --------
# resample : Convenience method for frequency conversion and resampling
#     of time series.
#
# Notes
# -----
# See the `user guide
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html>`__ for more
# detailed usage and examples, including splitting an object into groups,
# iterating through groups, selecting a group, aggregation, and more.
#
# Examples
# --------
# >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
# ...                               'Parrot', 'Parrot'],
# ...                    'Max Speed': [380., 370., 24., 26.]})
# >>> df
#    Animal  Max Speed
# 0  Falcon      380.0
# 1  Falcon      370.0
# 2  Parrot       24.0
# 3  Parrot       26.0
# >>> df.groupby(['Animal']).mean()
#         Max Speed
# Animal
# Falcon      375.0
# Parrot       25.0
#
# **Hierarchical Indexes**
#
# We can groupby different levels of a hierarchical index
# using the `level` parameter:
#
# >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
# ...           ['Captive', 'Wild', 'Captive', 'Wild']]
# >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
# >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
# ...                   index=index)
# >>> df
#                 Max Speed
# Animal Type
# Falcon Captive      390.0
#        Wild         350.0
# Parrot Captive       30.0
#        Wild          20.0
# >>> df.groupby(level=0).mean()
#         Max Speed
# Animal
# Falcon      370.0
# Parrot       25.0
# >>> df.groupby(level="Type").mean()
#          Max Speed
# Type
# Captive      210.0
# Wild         185.0
#
# We can also choose to include NA in group keys or not by setting
# `dropna` parameter, the default setting is `True`.
#
# >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by=["b"]).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
#
# >>> df.groupby(by=["b"], dropna=False).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
# NaN 1   4
#
# >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by="a").sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
#
# >>> df.groupby(by="a", dropna=False).sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
# NaN 12.3   33.0
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.info</u></summary>
# <blockquote>
# <code>
# Print a concise summary of a DataFrame.
#
# This method prints information about a DataFrame including
# the index dtype and columns, non-null values and memory usage.
#
# Parameters
# ----------
# data : DataFrame
#     DataFrame to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# max_cols : int, optional
#     When to switch from the verbose to the truncated output. If the
#     DataFrame has more than `max_cols` columns, the truncated output
#     is used. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is used.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the DataFrame
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always show memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
# null_counts : bool, optional
#     .. deprecated:: 1.2.0
#         Use show_counts instead.
#
# Returns
# -------
# None
#     This method prints a summary of a DataFrame and returns None.
#
# See Also
# --------
# DataFrame.describe: Generate descriptive statistics of DataFrame
#     columns.
# DataFrame.memory_usage: Memory usage of DataFrame columns.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
# >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
# ...                   "float_col": float_values})
# >>> df
#     int_col text_col  float_col
# 0        1    alpha       0.00
# 1        2     beta       0.25
# 2        3    gamma       0.50
# 3        4    delta       0.75
# 4        5  epsilon       1.00
#
# Prints information of all columns:
#
# >>> df.info(verbose=True)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Data columns (total 3 columns):
#  #   Column     Non-Null Count  Dtype
# ---  ------     --------------  -----
#  0   int_col    5 non-null      int64
#  1   text_col   5 non-null      object
#  2   float_col  5 non-null      float64
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Prints a summary of columns count and its dtypes but not per column
# information:
#
# >>> df.info(verbose=False)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Columns: 3 entries, int_col to float_col
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Pipe output of DataFrame.info to a buffer instead of sys.stdout, get the
# buffer content and write it to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> df.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, especially
# useful for big DataFrames and for fine-tuning memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> df = pd.DataFrame({
# ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
# ... })
# >>> df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 22.9+ MB
#
# >>> df.info(memory_usage='deep')
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 165.9 MB
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.isnull</u></summary>
# <blockquote>
# <code>
# DataFrame.isnull is an alias for DataFrame.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# DataFrame
#     Mask of bool values for each element in DataFrame that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# DataFrame.isnull : Alias of isna.
# DataFrame.notna : Boolean inverse of isna.
# DataFrame.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations</u></summary>
# <blockquote>
# <code>
# Add the operations to the cls; evaluate the doc strings again
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.mean</u></summary>
# <blockquote>
# <code>
# Return the mean of the values over the requested axis.
#
# Parameters
# ----------
# axis : {index (0)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a scalar.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# scalar or Series (if level specified)
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.&lt;locals&gt;.sum</u></summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.to_csv</u></summary>
# <blockquote>
# <code>
# Write object to a comma-separated values (csv) file.
#
# Parameters
# ----------
# path_or_buf : str, path object, file-like object, or None, default None
#     String, path object (implementing os.PathLike[str]), or file-like
#     object implementing a write() function. If None, the result is
#     returned as a string. If a non-binary file object is passed, it should
#     be opened with `newline=''`, disabling universal newlines. If a binary
#     file object is passed, `mode` might need to contain a `'b'`.
#
#     .. versionchanged:: 1.2.0
#
#        Support for binary file objects was introduced.
#
# sep : str, default ','
#     String of length 1. Field delimiter for the output file.
# na_rep : str, default ''
#     Missing data representation.
# float_format : str, default None
#     Format string for floating point numbers.
# columns : sequence, optional
#     Columns to write.
# header : bool or list of str, default True
#     Write out the column names. If a list of strings is given it is
#     assumed to be aliases for the column names.
# index : bool, default True
#     Write row names (index).
# index_label : str or sequence, or False, default None
#     Column label for index column(s) if desired. If None is given, and
#     `header` and `index` are True, then the index names are used. A
#     sequence should be given if the object uses MultiIndex. If
#     False do not print fields for index names. Use index_label=False
#     for easier importing in R.
# mode : str
#     Python write mode, default 'w'.
# encoding : str, optional
#     A string representing the encoding to use in the output file,
#     defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
#     is a non-binary file object.
# compression : str or dict, default 'infer'
#     For on-the-fly compression of the output data. If 'infer' and 'path_or_buf'
#     is path-like, then detect compression from the following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). Set to
#     ``None`` for no compression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdCompressor``, respectively. As an
#     example, the following could be passed for faster compression and to create
#     a reproducible gzip archive:
#     ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
#
#     .. versionchanged:: 1.0.0
#
#        May now be a dict with key 'method' as compression mode
#        and other entries as additional compression options if
#        compression mode is 'zip'.
#
#     .. versionchanged:: 1.1.0
#
#        Passing compression options as keys in dict is
#        supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
#
#     .. versionchanged:: 1.2.0
#
#         Compression is supported for binary file objects.
#
#     .. versionchanged:: 1.2.0
#
#         Previous versions forwarded dict entries for 'gzip' to
#         `gzip.open` instead of `gzip.GzipFile` which prevented
#         setting `mtime`.
#
# quoting : optional constant from csv module
#     Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
#     then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
#     will treat them as non-numeric.
# quotechar : str, default '\"'
#     String of length 1. Character used to quote fields.
# line_terminator : str, optional
#     The newline character or character sequence to use in the output
#     file. Defaults to `os.linesep`, which depends on the OS in which
#     this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
# chunksize : int or None
#     Rows to write at a time.
# date_format : str, default None
#     Format string for datetime objects.
# doublequote : bool, default True
#     Control quoting of `quotechar` inside a field.
# escapechar : str, default None
#     String of length 1. Character used to escape `sep` and `quotechar`
#     when appropriate.
# decimal : str, default '.'
#     Character recognized as decimal separator. E.g. use ',' for
#     European data.
# errors : str, default 'strict'
#     Specifies how encoding and decoding errors are to be handled.
#     See the errors argument for :func:`open` for a full list
#     of options.
#
#     .. versionadded:: 1.1.0
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2.0
#
# Returns
# -------
# None or str
#     If path_or_buf is None, returns the resulting csv format as a
#     string. Otherwise returns None.
#
# See Also
# --------
# read_csv : Load a CSV file into a DataFrame.
# to_excel : Write DataFrame to an Excel file.
#
# Examples
# --------
# >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
# ...                    'mask': ['red', 'purple'],
# ...                    'weapon': ['sai', 'bo staff']})
# >>> df.to_csv(index=False)
# 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
#
# Create 'out.zip' containing 'out.csv'
#
# >>> compression_opts = dict(method='zip',
# ...                         archive_name='out.csv')  # doctest: +SKIP
# >>> df.to_csv('out.zip', index=False,
# ...           compression=compression_opts)  # doctest: +SKIP
#
# To write a csv file to a new folder or nested folder you will first
# need to create it using either Pathlib or os:
#
# >>> from pathlib import Path  # doctest: +SKIP
# >>> filepath = Path('folder/subfolder/out.csv')  # doctest: +SKIP
# >>> filepath.parent.mkdir(parents=True, exist_ok=True)  # doctest: +SKIP
# >>> df.to_csv(filepath)  # doctest: +SKIP
#
# >>> import os  # doctest: +SKIP
# >>> os.makedirs('folder/subfolder', exist_ok=True)  # doctest: +SKIP
# >>> df.to_csv('folder/subfolder/out.csv')  # doctest: +SKIP
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.groupby.generic.DataFrameGroupBy</u></summary>
# <blockquote>
# <code>
# Class for grouping and aggregating relational data.
#
# See aggregate, transform, and apply functions on this object.
#
# It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
#
# ::
#
#     grouped = groupby(obj, ...)
#
# Parameters
# ----------
# obj : pandas object
# axis : int, default 0
# level : int, default None
#     Level of MultiIndex
# groupings : list of Grouping objects
#     Most users should ignore this
# exclusions : array-like, optional
#     List of columns to exclude
# name : str
#     Most users should ignore this
#
# Returns
# -------
# **Attributes**
# groups : dict
#     {group name -> group labels}
# len(grouped) : int
#     Number of groups
#
# Notes
# -----
# After grouping, see aggregate, apply, and transform functions. Here are
# some other brief notes about usage. When grouping by multiple groups, the
# result index will be a MultiIndex (hierarchical) by default.
#
# Iteration produces (key, group) tuples, i.e. chunking the data by group. So
# you can write code like:
#
# ::
#
#     grouped = obj.groupby(keys, axis=axis)
#     for key, group in grouped:
#         # do something with the data
#
# Function calls on GroupBy, if not specially implemented, "dispatch" to the
# grouped data. So if you group a DataFrame and wish to invoke the std()
# method on each group, you can simply do:
#
# ::
#
#     df.groupby(mapper).std()
#
# rather than
#
# ::
#
#     df.groupby(mapper).aggregate(np.std)
#
# You can pass arguments to these "wrapped" functions, too.
#
# See the online documentation for full exposition on these topics and much
# more
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.groupby.generic.SeriesGroupBy</u></summary>
# <blockquote>
# <code>
# Class for grouping and aggregating relational data.
#
# See aggregate, transform, and apply functions on this object.
#
# It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
#
# ::
#
#     grouped = groupby(obj, ...)
#
# Parameters
# ----------
# obj : pandas object
# axis : int, default 0
# level : int, default None
#     Level of MultiIndex
# groupings : list of Grouping objects
#     Most users should ignore this
# exclusions : array-like, optional
#     List of columns to exclude
# name : str
#     Most users should ignore this
#
# Returns
# -------
# **Attributes**
# groups : dict
#     {group name -> group labels}
# len(grouped) : int
#     Number of groups
#
# Notes
# -----
# After grouping, see aggregate, apply, and transform functions. Here are
# some other brief notes about usage. When grouping by multiple groups, the
# result index will be a MultiIndex (hierarchical) by default.
#
# Iteration produces (key, group) tuples, i.e. chunking the data by group. So
# you can write code like:
#
# ::
#
#     grouped = obj.groupby(keys, axis=axis)
#     for key, group in grouped:
#         # do something with the data
#
# Function calls on GroupBy, if not specially implemented, "dispatch" to the
# grouped data. So if you group a DataFrame and wish to invoke the std()
# method on each group, you can simply do:
#
# ::
#
#     df.groupby(mapper).std()
#
# rather than
#
# ::
#
#     df.groupby(mapper).aggregate(np.std)
#
# You can pass arguments to these "wrapped" functions, too.
#
# See the online documentation for full exposition on these topics and much
# more
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u></summary>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series</u></summary>
# <blockquote>
# <code>
# One-dimensional ndarray with axis labels (including time series).
#
# Labels need not be unique but must be a hashable type. The object
# supports both integer- and label-based indexing and provides a host of
# methods for performing operations involving the index. Statistical
# methods from ndarray have been overridden to automatically exclude
# missing data (currently represented as NaN).
#
# Operations between Series (+, -, /, \*, \*\*) align values based on their
# associated index values-- they need not be the same length. The result
# index will be the sorted union of the two indexes.
#
# Parameters
# ----------
# data : array-like, Iterable, dict, or scalar value
#     Contains data stored in Series. If data is a dict, argument order is
#     maintained.
# index : array-like or Index (1d)
#     Values must be hashable and have the same length as `data`.
#     Non-unique index values are allowed. Will default to
#     RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like
#     and index is None, then the keys in the data are used as the index. If the
#     index is not None, the resulting Series is reindexed with the index values.
# dtype : str, numpy.dtype, or ExtensionDtype, optional
#     Data type for the output Series. If not specified, this will be
#     inferred from `data`.
#     See the :ref:`user guide <basics.dtypes>` for more usages.
# name : str, optional
#     The name to give to the Series.
# copy : bool, default False
#     Copy input data. Only affects Series or 1d ndarray input. See examples.
#
# Examples
# --------
# Constructing Series from a dictionary with an Index specified
#
# >>> d = {'a': 1, 'b': 2, 'c': 3}
# >>> ser = pd.Series(data=d, index=['a', 'b', 'c'])
# >>> ser
# a   1
# b   2
# c   3
# dtype: int64
#
# The keys of the dictionary match with the Index values, hence the Index
# values have no effect.
#
# >>> d = {'a': 1, 'b': 2, 'c': 3}
# >>> ser = pd.Series(data=d, index=['x', 'y', 'z'])
# >>> ser
# x   NaN
# y   NaN
# z   NaN
# dtype: float64
#
# Note that the Index is first built with the keys from the dictionary.
# After this the Series is reindexed with the given Index values, hence we
# get all NaN as a result.
#
# Constructing Series from a list with `copy=False`.
#
# >>> r = [1, 2]
# >>> ser = pd.Series(r, copy=False)
# >>> ser.iloc[0] = 999
# >>> r
# [1, 2]
# >>> ser
# 0    999
# 1      2
# dtype: int64
#
# Due to input data type the Series has a `copy` of
# the original data even though `copy=False`, so
# the data is unchanged.
#
# Constructing Series from a 1d ndarray with `copy=False`.
#
# >>> r = np.array([1, 2])
# >>> ser = pd.Series(r, copy=False)
# >>> ser.iloc[0] = 999
# >>> r
# array([999,   2])
# >>> ser
# 0    999
# 1      2
# dtype: int64
#
# Due to input data type the Series has a `view` on
# the original data, so
# the data is changed as well.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.apply</u></summary>
# <blockquote>
# <code>
# Invoke function on values of Series.
#
# Can be ufunc (a NumPy function that applies to the entire Series)
# or a Python function that only works on single values.
#
# Parameters
# ----------
# func : function
#     Python function or NumPy ufunc to apply.
# convert_dtype : bool, default True
#     Try to find better dtype for elementwise function results. If
#     False, leave as dtype=object. Note that the dtype is always
#     preserved for some extension array dtypes, such as Categorical.
# args : tuple
#     Positional arguments passed to func after the series value.
# **kwargs
#     Additional keyword arguments passed to func.
#
# Returns
# -------
# Series or DataFrame
#     If func returns a Series object the result will be a DataFrame.
#
# See Also
# --------
# Series.map: For element-wise operations.
# Series.agg: Only perform aggregating type operations.
# Series.transform: Only perform transforming type operations.
#
# Notes
# -----
# Functions that mutate the passed object can produce unexpected
# behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
# for more details.
#
# Examples
# --------
# Create a series with typical summer temperatures for each city.
#
# >>> s = pd.Series([20, 21, 12],
# ...               index=['London', 'New York', 'Helsinki'])
# >>> s
# London      20
# New York    21
# Helsinki    12
# dtype: int64
#
# Square the values by defining a function and passing it as an
# argument to ``apply()``.
#
# >>> def square(x):
# ...     return x ** 2
# >>> s.apply(square)
# London      400
# New York    441
# Helsinki    144
# dtype: int64
#
# Square the values by passing an anonymous function as an
# argument to ``apply()``.
#
# >>> s.apply(lambda x: x ** 2)
# London      400
# New York    441
# Helsinki    144
# dtype: int64
#
# Define a custom function that needs additional positional
# arguments and pass these additional arguments using the
# ``args`` keyword.
#
# >>> def subtract_custom_value(x, custom_value):
# ...     return x - custom_value
#
# >>> s.apply(subtract_custom_value, args=(5,))
# London      15
# New York    16
# Helsinki     7
# dtype: int64
#
# Define a custom function that takes keyword arguments
# and pass these arguments to ``apply``.
#
# >>> def add_custom_values(x, **kwargs):
# ...     for month in kwargs:
# ...         x += kwargs[month]
# ...     return x
#
# >>> s.apply(add_custom_values, june=30, july=20, august=25)
# London      95
# New York    96
# Helsinki    87
# dtype: int64
#
# Use a function from the Numpy library.
#
# >>> s.apply(np.log)
# London      2.995732
# New York    3.044522
# Helsinki    2.484907
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u></summary>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.fillna</u></summary>
# <blockquote>
# <code>
# Fill NA/NaN values using the specified method.
#
# Parameters
# ----------
# value : scalar, dict, Series, or DataFrame
#     Value to use to fill holes (e.g. 0), alternately a
#     dict/Series/DataFrame of values specifying which value to use for
#     each index (for a Series) or column (for a DataFrame).  Values not
#     in the dict/Series/DataFrame will not be filled. This value cannot
#     be a list.
# method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
#     Method to use for filling holes in reindexed Series
#     pad / ffill: propagate last valid observation forward to next valid
#     backfill / bfill: use next valid observation to fill gap.
# axis : {0 or 'index'}
#     Axis along which to fill missing values.
# inplace : bool, default False
#     If True, fill in-place. Note: this will modify any
#     other views on this object (e.g., a no-copy slice for a column in a
#     DataFrame).
# limit : int, default None
#     If method is specified, this is the maximum number of consecutive
#     NaN values to forward/backward fill. In other words, if there is
#     a gap with more than this number of consecutive NaNs, it will only
#     be partially filled. If method is not specified, this is the
#     maximum number of entries along the entire axis where NaNs will be
#     filled. Must be greater than 0 if not None.
# downcast : dict, default is None
#     A dict of item->dtype of what to downcast if possible,
#     or the string 'infer' which will try to downcast to an appropriate
#     equal type (e.g. float64 to int64 if possible).
#
# Returns
# -------
# Series or None
#     Object with missing values filled or None if ``inplace=True``.
#
# See Also
# --------
# interpolate : Fill NaN values using interpolation.
# reindex : Conform object to new index.
# asfreq : Convert TimeSeries to specified frequency.
#
# Examples
# --------
# >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
# ...                    [3, 4, np.nan, 1],
# ...                    [np.nan, np.nan, np.nan, np.nan],
# ...                    [np.nan, 3, np.nan, 4]],
# ...                   columns=list("ABCD"))
# >>> df
#      A    B   C    D
# 0  NaN  2.0 NaN  0.0
# 1  3.0  4.0 NaN  1.0
# 2  NaN  NaN NaN  NaN
# 3  NaN  3.0 NaN  4.0
#
# Replace all NaN elements with 0s.
#
# >>> df.fillna(0)
#      A    B    C    D
# 0  0.0  2.0  0.0  0.0
# 1  3.0  4.0  0.0  1.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  3.0  0.0  4.0
#
# We can also propagate non-null values forward or backward.
#
# >>> df.fillna(method="ffill")
#      A    B   C    D
# 0  NaN  2.0 NaN  0.0
# 1  3.0  4.0 NaN  1.0
# 2  3.0  4.0 NaN  1.0
# 3  3.0  3.0 NaN  4.0
#
# Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
# 2, and 3 respectively.
#
# >>> values = {"A": 0, "B": 1, "C": 2, "D": 3}
# >>> df.fillna(value=values)
#      A    B    C    D
# 0  0.0  2.0  2.0  0.0
# 1  3.0  4.0  2.0  1.0
# 2  0.0  1.0  2.0  3.0
# 3  0.0  3.0  2.0  4.0
#
# Only replace the first NaN element.
#
# >>> df.fillna(value=values, limit=1)
#      A    B    C    D
# 0  0.0  2.0  2.0  0.0
# 1  3.0  4.0  NaN  1.0
# 2  NaN  1.0  NaN  3.0
# 3  NaN  3.0  NaN  4.0
#
# When filling using a DataFrame, replacement happens along
# the same column names and same indices
#
# >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
# >>> df.fillna(df2)
#      A    B    C    D
# 0  0.0  2.0  0.0  0.0
# 1  3.0  4.0  0.0  1.0
# 2  0.0  0.0  0.0  NaN
# 3  0.0  3.0  0.0  4.0
#
# Note that column D is not affected since it is not present in df2.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.groupby</u></summary>
# <blockquote>
# <code>
# Group Series using a mapper or by a Series of columns.
#
# A groupby operation involves some combination of splitting the
# object, applying a function, and combining the results. This can be
# used to group large amounts of data and compute operations on these
# groups.
#
# Parameters
# ----------
# by : mapping, function, label, or list of labels
#     Used to determine the groups for the groupby.
#     If ``by`` is a function, it's called on each value of the object's
#     index. If a dict or Series is passed, the Series or dict VALUES
#     will be used to determine the groups (the Series' values are first
#     aligned; see ``.align()`` method). If a list or ndarray of length
#     equal to the selected axis is passed (see the `groupby user guide
#     <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
#     the values are used as-is to determine the groups. A label or list
#     of labels may be passed to group by the columns in ``self``.
#     Notice that a tuple is interpreted as a (single) key.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Split along rows (0) or columns (1).
# level : int, level name, or sequence of such, default None
#     If the axis is a MultiIndex (hierarchical), group by a particular
#     level or levels.
# as_index : bool, default True
#     For aggregated output, return object with group labels as the
#     index. Only relevant for DataFrame input. as_index=False is
#     effectively "SQL-style" grouped output.
# sort : bool, default True
#     Sort group keys. Get better performance by turning this off.
#     Note this does not influence the order of observations within each
#     group. Groupby preserves the order of rows within each group.
# group_keys : bool, default True
#     When calling apply, add group keys to index to identify pieces.
# squeeze : bool, default False
#     Reduce the dimensionality of the return type if possible,
#     otherwise return a consistent type.
#
#     .. deprecated:: 1.1.0
#
# observed : bool, default False
#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.
# dropna : bool, default True
#     If True, and if group keys contain NA values, NA values together
#     with row/column will be dropped.
#     If False, NA values will also be treated as the key in groups.
#
#     .. versionadded:: 1.1.0
#
# Returns
# -------
# SeriesGroupBy
#     Returns a groupby object that contains information about the groups.
#
# See Also
# --------
# resample : Convenience method for frequency conversion and resampling
#     of time series.
#
# Notes
# -----
# See the `user guide
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html>`__ for more
# detailed usage and examples, including splitting an object into groups,
# iterating through groups, selecting a group, aggregation, and more.
#
# Examples
# --------
# >>> ser = pd.Series([390., 350., 30., 20.],
# ...                 index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed")
# >>> ser
# Falcon    390.0
# Falcon    350.0
# Parrot     30.0
# Parrot     20.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(["a", "b", "a", "b"]).mean()
# a    210.0
# b    185.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(level=0).mean()
# Falcon    370.0
# Parrot     25.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(ser > 100).mean()
# Max Speed
# False     25.0
# True     370.0
# Name: Max Speed, dtype: float64
#
# **Grouping by Indexes**
#
# We can groupby different levels of a hierarchical index
# using the `level` parameter:
#
# >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
# ...           ['Captive', 'Wild', 'Captive', 'Wild']]
# >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
# >>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed")
# >>> ser
# Animal  Type
# Falcon  Captive    390.0
#         Wild       350.0
# Parrot  Captive     30.0
#         Wild        20.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(level=0).mean()
# Animal
# Falcon    370.0
# Parrot     25.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(level="Type").mean()
# Type
# Captive    210.0
# Wild       185.0
# Name: Max Speed, dtype: float64
#
# We can also choose to include `NA` in group keys or not by defining
# `dropna` parameter, the default setting is `True`.
#
# >>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
# >>> ser.groupby(level=0).sum()
# a    3
# b    3
# dtype: int64
#
# >>> ser.groupby(level=0, dropna=False).sum()
# a    3
# b    3
# NaN  3
# dtype: int64
#
# >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
# >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
# >>> ser.groupby(["a", "b", "a", np.nan]).mean()
# a    210.0
# b    350.0
# Name: Max Speed, dtype: float64
#
# >>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
# a    210.0
# b    350.0
# NaN   20.0
# Name: Max Speed, dtype: float64
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.info</u></summary>
# <blockquote>
# <code>
# Print a concise summary of a Series.
#
# This method prints information about a Series including
# the index dtype, non-null values and memory usage.
#
# .. versionadded:: 1.4.0
#
# Parameters
# ----------
# data : Series
#     Series to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the Series
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always show memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the Series is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
#
# Returns
# -------
# None
#     This method prints a summary of a Series and returns None.
#
# See Also
# --------
# Series.describe: Generate descriptive statistics of Series.
# Series.memory_usage: Memory usage of Series.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> s = pd.Series(text_values, index=int_values)
# >>> s.info()
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# Series name: None
# Non-Null Count  Dtype
# --------------  -----
# 5 non-null      object
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Prints a summary excluding information about its values:
#
# >>> s.info(verbose=False)
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Pipe output of Series.info to buffer instead of sys.stdout, get
# buffer content and write to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> s.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, especially
# useful for big Series and for fine-tuning memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
# >>> s.info()
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 7.6+ MB
#
# >>> s.info(memory_usage='deep')
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 55.3 MB
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.isin</u></summary>
# <blockquote>
# <code>
# Whether elements in Series are contained in `values`.
#
# Return a boolean Series showing whether each element in the Series
# matches an element in the passed sequence of `values` exactly.
#
# Parameters
# ----------
# values : set or list-like
#     The sequence of values to test. Passing in a single string will
#     raise a ``TypeError``. Instead, turn a single string into a
#     list of one element.
#
# Returns
# -------
# Series
#     Series of booleans indicating if each element is in values.
#
# Raises
# ------
# TypeError
#   * If `values` is a string
#
# See Also
# --------
# DataFrame.isin : Equivalent method on DataFrame.
#
# Examples
# --------
# >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
# ...                'hippo'], name='animal')
# >>> s.isin(['cow', 'lama'])
# 0     True
# 1     True
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# To invert the boolean values, use the ``~`` operator:
#
# >>> ~s.isin(['cow', 'lama'])
# 0    False
# 1    False
# 2    False
# 3     True
# 4    False
# 5     True
# Name: animal, dtype: bool
#
# Passing a single string as ``s.isin('lama')`` will raise an error. Use
# a list of one element instead:
#
# >>> s.isin(['lama'])
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# Strings and integers are distinct and are therefore not comparable:
#
# >>> pd.Series([1]).isin(['1'])
# 0    False
# dtype: bool
# >>> pd.Series([1.1]).isin(['1.1'])
# 0    False
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.isnull</u></summary>
# <blockquote>
# <code>
# Series.isnull is an alias for Series.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# Series.isnull : Alias of isna.
# Series.notna : Boolean inverse of isna.
# Series.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u></summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.notnull</u></summary>
# <blockquote>
# <code>
# Series.notnull is an alias for Series.notna.
#
# Detect existing (non-missing) values.
#
# Return a boolean same-sized object indicating if the values are not NA.
# Non-missing values get mapped to True. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
# NA values, such as None or :attr:`numpy.NaN`, get mapped to False
# values.
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is not an NA value.
#
# See Also
# --------
# Series.notnull : Alias of notna.
# Series.isna : Boolean inverse of notna.
# Series.dropna : Omit axes labels with missing values.
# notna : Top-level notna.
#
# Examples
# --------
# Show which entries in a DataFrame are not NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.notna()
#      age   born  name    toy
# 0   True  False  True  False
# 1   True   True  True   True
# 2  False   True  True   True
#
# Show which entries in a Series are not NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.notna()
# 0     True
# 1     True
# 2    False
# dtype: bool
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.io.parsers.readers.read_csv</u></summary>
# <blockquote>
# <code>
# Read a comma-separated values (csv) file into DataFrame.
#
# Also supports optionally iterating over the file or breaking it
# into chunks.
#
# Additional help can be found in the online docs for
# `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
#
# Parameters
# ----------
# filepath_or_buffer : str, path object or file-like object
#     Any valid string path is acceptable. The string could be a URL. Valid
#     URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
#     expected. A local file could be: file://localhost/path/to/table.csv.
#
#     If you want to pass in a path object, pandas accepts any ``os.PathLike``.
#
#     By file-like object, we refer to objects with a ``read()`` method, such as
#     a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
# sep : str, default ','
#     Delimiter to use. If sep is None, the C engine cannot automatically detect
#     the separator, but the Python parsing engine can, meaning the latter will
#     be used and automatically detect the separator by Python's builtin sniffer
#     tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
#     different from ``'\s+'`` will be interpreted as regular expressions and
#     will also force the use of the Python parsing engine. Note that regex
#     delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
# delimiter : str, default ``None``
#     Alias for sep.
# header : int, list of int, None, default 'infer'
#     Row number(s) to use as the column names, and the start of the
#     data.  Default behavior is to infer the column names: if no names
#     are passed the behavior is identical to ``header=0`` and column
#     names are inferred from the first line of the file, if column
#     names are passed explicitly then the behavior is identical to
#     ``header=None``. Explicitly pass ``header=0`` to be able to
#     replace existing names. The header can be a list of integers that
#     specify row locations for a multi-index on the columns
#     e.g. [0,1,3]. Intervening rows that are not specified will be
#     skipped (e.g. 2 in this example is skipped). Note that this
#     parameter ignores commented lines and empty lines if
#     ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
#     data rather than the first line of the file.
# names : array-like, optional
#     List of column names to use. If the file contains a header row,
#     then you should explicitly pass ``header=0`` to override the column names.
#     Duplicates in this list are not allowed.
# index_col : int, str, sequence of int / str, or False, optional, default ``None``
#   Column(s) to use as the row labels of the ``DataFrame``, either given as
#   string name or column index. If a sequence of int / str is given, a
#   MultiIndex is used.
#
#   Note: ``index_col=False`` can be used to force pandas to *not* use the first
#   column as the index, e.g. when you have a malformed file with delimiters at
#   the end of each line.
# usecols : list-like or callable, optional
#     Return a subset of the columns. If list-like, all elements must either
#     be positional (i.e. integer indices into the document columns) or strings
#     that correspond to column names provided either by the user in `names` or
#     inferred from the document header row(s). If ``names`` are given, the document
#     header row(s) are not taken into account. For example, a valid list-like
#     `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
#     Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
#     To instantiate a DataFrame from ``data`` with element order preserved use
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
#     in ``['foo', 'bar']`` order or
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
#     for ``['bar', 'foo']`` order.
#
#     If callable, the callable function will be evaluated against the column
#     names, returning names where the callable function evaluates to True. An
#     example of a valid callable argument would be ``lambda x: x.upper() in
#     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
#     parsing time and lower memory usage.
# squeeze : bool, default False
#     If the parsed data only contains one column then return a Series.
#
#     .. deprecated:: 1.4.0
#         Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
#         the data.
# prefix : str, optional
#     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
#
#     .. deprecated:: 1.4.0
#        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
# mangle_dupe_cols : bool, default True
#     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
#     'X'...'X'. Passing in False will cause data to be overwritten if there
#     are duplicate names in the columns.
# dtype : Type name or dict of column -> type, optional
#     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
#     'c': 'Int64'}
#     Use `str` or `object` together with suitable `na_values` settings
#     to preserve and not interpret dtype.
#     If converters are specified, they will be applied INSTEAD
#     of dtype conversion.
# engine : {'c', 'python', 'pyarrow'}, optional
#     Parser engine to use. The C and pyarrow engines are faster, while the python engine
#     is currently more feature-complete. Multithreading is currently only supported by
#     the pyarrow engine.
#
#     .. versionadded:: 1.4.0
#
#         The "pyarrow" engine was added as an *experimental* engine, and some features
#         are unsupported, or may not work correctly, with this engine.
# converters : dict, optional
#     Dict of functions for converting values in certain columns. Keys can either
#     be integers or column labels.
# true_values : list, optional
#     Values to consider as True.
# false_values : list, optional
#     Values to consider as False.
# skipinitialspace : bool, default False
#     Skip spaces after delimiter.
# skiprows : list-like, int or callable, optional
#     Line numbers to skip (0-indexed) or number of lines to skip (int)
#     at the start of the file.
#
#     If callable, the callable function will be evaluated against the row
#     indices, returning True if the row should be skipped and False otherwise.
#     An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
# skipfooter : int, default 0
#     Number of lines at bottom of file to skip (Unsupported with engine='c').
# nrows : int, optional
#     Number of rows of file to read. Useful for reading pieces of large files.
# na_values : scalar, str, list-like, or dict, optional
#     Additional strings to recognize as NA/NaN. If dict passed, specific
#     per-column NA values.  By default the following values are interpreted as
#     NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
#     '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
#     'nan', 'null'.
# keep_default_na : bool, default True
#     Whether or not to include the default NaN values when parsing the data.
#     Depending on whether `na_values` is passed in, the behavior is as follows:
#
#     * If `keep_default_na` is True, and `na_values` are specified, `na_values`
#       is appended to the default NaN values used for parsing.
#     * If `keep_default_na` is True, and `na_values` are not specified, only
#       the default NaN values are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are specified, only
#       the NaN values specified in `na_values` are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are not specified, no
#       strings will be parsed as NaN.
#
#     Note that if `na_filter` is passed in as False, the `keep_default_na` and
#     `na_values` parameters will be ignored.
# na_filter : bool, default True
#     Detect missing value markers (empty strings and the value of na_values). In
#     data without any NAs, passing na_filter=False can improve the performance
#     of reading a large file.
# verbose : bool, default False
#     Indicate number of NA values placed in non-numeric columns.
# skip_blank_lines : bool, default True
#     If True, skip over blank lines rather than interpreting as NaN values.
# parse_dates : bool or list of int or names or list of lists or dict, default False
#     The behavior is as follows:
#
#     * boolean. If True -> try parsing the index.
#     * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
#       each as a separate date column.
#     * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
#       a single date column.
#     * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
#       result 'foo'
#
#     If a column or index cannot be represented as an array of datetimes,
#     say because of an unparsable value or a mixture of timezones, the column
#     or index will be returned unaltered as an object data type. For
#     non-standard datetime parsing, use ``pd.to_datetime`` after
#     ``pd.read_csv``. To parse an index or column with a mixture of timezones,
#     specify ``date_parser`` to be a partially-applied
#     :func:`pandas.to_datetime` with ``utc=True``. See
#     :ref:`io.csv.mixed_timezones` for more.
#
#     Note: A fast-path exists for iso8601-formatted dates.
# infer_datetime_format : bool, default False
#     If True and `parse_dates` is enabled, pandas will attempt to infer the
#     format of the datetime strings in the columns, and if it can be inferred,
#     switch to a faster method of parsing them. In some cases this can increase
#     the parsing speed by 5-10x.
# keep_date_col : bool, default False
#     If True and `parse_dates` specifies combining multiple columns then
#     keep the original columns.
# date_parser : function, optional
#     Function to use for converting a sequence of string columns to an array of
#     datetime instances. The default uses ``dateutil.parser.parser`` to do the
#     conversion. Pandas will try to call `date_parser` in three different ways,
#     advancing to the next if an exception occurs: 1) Pass one or more arrays
#     (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
#     string values from the columns defined by `parse_dates` into a single array
#     and pass that; and 3) call `date_parser` once for each row using one or
#     more strings (corresponding to the columns defined by `parse_dates`) as
#     arguments.
# dayfirst : bool, default False
#     DD/MM format dates, international and European format.
# cache_dates : bool, default True
#     If True, use a cache of unique, converted dates to apply the datetime
#     conversion. May produce significant speed-up when parsing duplicate
#     date strings, especially ones with timezone offsets.
#
#     .. versionadded:: 0.25.0
# iterator : bool, default False
#     Return TextFileReader object for iteration or getting chunks with
#     ``get_chunk()``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# chunksize : int, optional
#     Return TextFileReader object for iteration.
#     See the `IO Tools docs
#     <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
#     for more information on ``iterator`` and ``chunksize``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# compression : str or dict, default 'infer'
#     For on-the-fly decompression of on-disk data. If 'infer' and
#     `filepath_or_buffer` is path-like, then detect compression from the
#     following extensions: '.gz', '.bz2', '.zip', '.xz', or '.zst'
#     (otherwise no compression). If using
#     'zip', the ZIP file must contain only one data file to be read in. Set to
#     ``None`` for no decompression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
#     example, the following could be passed for Zstandard decompression using a
#     custom compression dictionary:
#     ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
#
#     .. versionchanged:: 1.4.0 Zstandard support.
#
# thousands : str, optional
#     Thousands separator.
# decimal : str, default '.'
#     Character to recognize as decimal point (e.g. use ',' for European data).
# lineterminator : str (length 1), optional
#     Character to break file into lines. Only valid with C parser.
# quotechar : str (length 1), optional
#     The character used to denote the start and end of a quoted item. Quoted
#     items can include the delimiter and it will be ignored.
# quoting : int or csv.QUOTE_* instance, default 0
#     Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
#     QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
# doublequote : bool, default ``True``
#    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
#    whether or not to interpret two consecutive quotechar elements INSIDE a
#    field as a single ``quotechar`` element.
# escapechar : str (length 1), optional
#     One-character string used to escape other characters.
# comment : str, optional
#     Indicates remainder of line should not be parsed. If found at the beginning
#     of a line, the line will be ignored altogether. This parameter must be a
#     single character. Like empty lines (as long as ``skip_blank_lines=True``),
#     fully commented lines are ignored by the parameter `header` but not by
#     `skiprows`. For example, if ``comment='#'``, parsing
#     ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
#     treated as the header.
# encoding : str, optional
#     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
#     standard encodings
#     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
#
#     .. versionchanged:: 1.2
#
#        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
#        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
#        This behavior was previously only the case for ``engine="python"``.
#
#     .. versionchanged:: 1.3.0
#
#        ``encoding_errors`` is a new argument. ``encoding`` no longer has an
#        influence on how encoding errors are handled.
#
# encoding_errors : str, optional, default "strict"
#     How encoding errors are treated. `List of possible values
#     <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
#
#     .. versionadded:: 1.3.0
#
# dialect : str or csv.Dialect, optional
#     If provided, this parameter will override values (default or not) for the
#     following parameters: `delimiter`, `doublequote`, `escapechar`,
#     `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
#     override values, a ParserWarning will be issued. See csv.Dialect
#     documentation for more details.
# error_bad_lines : bool, optional, default ``None``
#     Lines with too many fields (e.g. a csv line with too many commas) will by
#     default cause an exception to be raised, and no DataFrame will be returned.
#     If False, then these "bad lines" will be dropped from the DataFrame that is
#     returned.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# warn_bad_lines : bool, optional, default ``None``
#     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
#     "bad line" will be output.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
#     Specifies what to do upon encountering a bad line (a line with too many fields).
#     Allowed values are :
#
#         - 'error', raise an Exception when a bad line is encountered.
#         - 'warn', raise a warning when a bad line is encountered and skip that line.
#         - 'skip', skip bad lines without raising or warning when they are encountered.
#
#     .. versionadded:: 1.3.0
#
#         - callable, function with signature
#           ``(bad_line: list[str]) -> list[str] | None`` that will process a single
#           bad line. ``bad_line`` is a list of strings split by the ``sep``.
#           If the function returns ``None``, the bad line will be ignored.
#           If the function returns a new list of strings with more elements than
#           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
#           Only supported when ``engine="python"``
#
#     .. versionadded:: 1.4.0
#
# delim_whitespace : bool, default False
#     Specifies whether or not whitespace (e.g. ``' '`` or ``'    '``) will be
#     used as the sep. Equivalent to setting ``sep='\s+'``. If this option
#     is set to True, nothing should be passed in for the ``delimiter``
#     parameter.
# low_memory : bool, default True
#     Internally process the file in chunks, resulting in lower memory use
#     while parsing, but possibly mixed type inference.  To ensure no mixed
#     types either set False, or specify the type with the `dtype` parameter.
#     Note that the entire file is read into a single DataFrame regardless,
#     use the `chunksize` or `iterator` parameter to return the data in chunks.
#     (Only valid with C parser).
# memory_map : bool, default False
#     If a filepath is provided for `filepath_or_buffer`, map the file object
#     directly onto memory and access the data directly from there. Using this
#     option can improve performance because there is no longer any I/O overhead.
# float_precision : str, optional
#     Specifies which converter the C engine should use for floating-point
#     values. The options are ``None`` or 'high' for the ordinary converter,
#     'legacy' for the original lower precision pandas converter, and
#     'round_trip' for the round-trip converter.
#
#     .. versionchanged:: 1.2
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2
#
# Returns
# -------
# DataFrame or TextFileReader
#     A comma-separated values (csv) file is returned as two-dimensional
#     data structure with labeled axes.
#
# See Also
# --------
# DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_fwf : Read a table of fixed-width formatted lines into DataFrame.
#
# Examples
# --------
# >>> pd.read_csv('data.csv')  # doctest: +SKIP
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>seaborn</b>
# <ul>
# <li>
# <details><summary><u>seaborn.distributions.histplot</u></summary>
# <blockquote>
# <code>
# Plot univariate or bivariate histograms to show distributions of datasets.
#
# A histogram is a classic visualization tool that represents the distribution
# of one or more variables by counting the number of observations that fall within
# discrete bins.
#
# This function can normalize the statistic computed within each bin to estimate
# frequency, density or probability mass, and it can add a smooth curve obtained
# using a kernel density estimate, similar to :func:`kdeplot`.
#
# More information is provided in the :ref:`user guide <tutorial_hist>`.
#
# Parameters
# ----------
# data : :class:`pandas.DataFrame`, :class:`numpy.ndarray`, mapping, or sequence
#     Input data structure. Either a long-form collection of vectors that can be
#     assigned to named variables or a wide-form dataset that will be internally
#     reshaped.
# x, y : vectors or keys in ``data``
#     Variables that specify positions on the x and y axes.
# hue : vector or key in ``data``
#     Semantic variable that is mapped to determine the color of plot elements.
# weights : vector or key in ``data``
#     If provided, weight the contribution of the corresponding data points
#     towards the count in each bin by these factors.
# stat : str
#     Aggregate statistic to compute in each bin.
#     
#     - `count`: show the number of observations in each bin
#     - `frequency`: show the number of observations divided by the bin width
#     - `probability` or `proportion`: normalize such that bar heights sum to 1
#     - `percent`: normalize such that bar heights sum to 100
#     - `density`: normalize such that the total area of the histogram equals 1
# bins : str, number, vector, or a pair of such values
#     Generic bin parameter that can be the name of a reference rule,
#     the number of bins, or the breaks of the bins.
#     Passed to :func:`numpy.histogram_bin_edges`.
# binwidth : number or pair of numbers
#     Width of each bin, overrides ``bins`` but can be used with
#     ``binrange``.
# binrange : pair of numbers or a pair of pairs
#     Lowest and highest value for bin edges; can be used either
#     with ``bins`` or ``binwidth``. Defaults to data extremes.
# discrete : bool
#     If True, default to ``binwidth=1`` and draw the bars so that they are
#     centered on their corresponding data points. This avoids "gaps" that may
#     otherwise appear when using discrete (integer) data.
# cumulative : bool
#     If True, plot the cumulative counts as bins increase.
# common_bins : bool
#     If True, use the same bins when semantic variables produce multiple
#     plots. If using a reference rule to determine the bins, it will be computed
#     with the full dataset.
# common_norm : bool
#     If True and using a normalized statistic, the normalization will apply over
#     the full dataset. Otherwise, normalize each histogram independently.
# multiple : {"layer", "dodge", "stack", "fill"}
#     Approach to resolving multiple elements when semantic mapping creates subsets.
#     Only relevant with univariate data.
# element : {"bars", "step", "poly"}
#     Visual representation of the histogram statistic.
#     Only relevant with univariate data.
# fill : bool
#     If True, fill in the space under the histogram.
#     Only relevant with univariate data.
# shrink : number
#     Scale the width of each bar relative to the binwidth by this factor.
#     Only relevant with univariate data.
# kde : bool
#     If True, compute a kernel density estimate to smooth the distribution
#     and show on the plot as (one or more) line(s).
#     Only relevant with univariate data.
# kde_kws : dict
#     Parameters that control the KDE computation, as in :func:`kdeplot`.
# line_kws : dict
#     Parameters that control the KDE visualization, passed to
#     :meth:`matplotlib.axes.Axes.plot`.
# thresh : number or None
#     Cells with a statistic less than or equal to this value will be transparent.
#     Only relevant with bivariate data.
# pthresh : number or None
#     Like ``thresh``, but a value in [0, 1] such that cells with aggregate counts
#     (or other statistics, when used) up to this proportion of the total will be
#     transparent.
# pmax : number or None
#     A value in [0, 1] that sets the saturation point for the colormap at a value
#     such that cells below it constitute this proportion of the total count (or
#     other statistic, when used).
# cbar : bool
#     If True, add a colorbar to annotate the color mapping in a bivariate plot.
#     Note: Does not currently support plots with a ``hue`` variable well.
# cbar_ax : :class:`matplotlib.axes.Axes`
#     Pre-existing axes for the colorbar.
# cbar_kws : dict
#     Additional parameters passed to :meth:`matplotlib.figure.Figure.colorbar`.
# palette : string, list, dict, or :class:`matplotlib.colors.Colormap`
#     Method for choosing the colors to use when mapping the ``hue`` semantic.
#     String values are passed to :func:`color_palette`. List or dict values
#     imply categorical mapping, while a colormap object implies numeric mapping.
# hue_order : vector of strings
#     Specify the order of processing and plotting for categorical levels of the
#     ``hue`` semantic.
# hue_norm : tuple or :class:`matplotlib.colors.Normalize`
#     Either a pair of values that set the normalization range in data units
#     or an object that will map from data units into a [0, 1] interval. Usage
#     implies numeric mapping.
# color : :mod:`matplotlib color <matplotlib.colors>`
#     Single color specification for when hue mapping is not used. Otherwise, the
#     plot will try to hook into the matplotlib property cycle.
# log_scale : bool or number, or pair of bools or numbers
#     Set axis scale(s) to log. A single value sets the data axis for univariate
#     distributions and both axes for bivariate distributions. A pair of values
#     sets each axis independently. Numeric values are interpreted as the desired
#     base (default 10). If `False`, defer to the existing Axes scale.
# legend : bool
#     If False, suppress the legend for semantic variables.
# ax : :class:`matplotlib.axes.Axes`
#     Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca`
#     internally.
# kwargs
#     Other keyword arguments are passed to one of the following matplotlib
#     functions:
#
#     - :meth:`matplotlib.axes.Axes.bar` (univariate, element="bars")
#     - :meth:`matplotlib.axes.Axes.fill_between` (univariate, other element, fill=True)
#     - :meth:`matplotlib.axes.Axes.plot` (univariate, other element, fill=False)
#     - :meth:`matplotlib.axes.Axes.pcolormesh` (bivariate)
#
# Returns
# -------
# :class:`matplotlib.axes.Axes`
#     The matplotlib axes containing the plot.
#
# See Also
# --------
# displot : Figure-level interface to distribution plot functions.
# kdeplot : Plot univariate or bivariate distributions using kernel density estimation.
# rugplot : Plot a tick at each observation value along the x and/or y axes.
# ecdfplot : Plot empirical cumulative distribution functions.
# jointplot : Draw a bivariate plot with univariate marginal distributions.
#
# Notes
# -----
#
# The choice of bins for computing and plotting a histogram can exert
# substantial influence on the insights that one is able to draw from the
# visualization. If the bins are too large, they may erase important features.
# On the other hand, bins that are too small may be dominated by random
# variability, obscuring the shape of the true underlying distribution. The
# default bin size is determined using a reference rule that depends on the
# sample size and variance. This works well in many cases, (i.e., with
# "well-behaved" data) but it fails in others. It is always a good idea to try
# different bin sizes to be sure that you are not missing something important.
# This function allows you to specify bins in several different ways, such as
# by setting the total number of bins to use, the width of each bin, or the
# specific locations where the bins should break.
#
# Examples
# --------
#
# .. include:: ../docstrings/histplot.rst
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <b>sklearn</b>
# <ul>
# <li>
# <details><summary><u>sklearn.base.TransformerMixin.fit_transform</u></summary>
# <blockquote>
# <code>
# Fit to data, then transform it.
#
# Fits transformer to `X` and `y` with optional parameters `fit_params`
# and returns a transformed version of `X`.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input samples.
#
# y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
#     Target values (None for unsupervised transformations).
#
# **fit_params : dict
#     Additional fit parameters.
#
# Returns
# -------
# X_new : ndarray of shape (n_samples, n_features_new)
#     Transformed array.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.metrics._regression.mean_squared_error</u></summary>
# <blockquote>
# <code>
# Mean squared error regression loss.
#
# Read more in the :ref:`User Guide <mean_squared_error>`.
#
# Parameters
# ----------
# y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
#     Ground truth (correct) target values.
#
# y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
#     Estimated target values.
#
# sample_weight : array-like of shape (n_samples,), default=None
#     Sample weights.
#
# multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'
#     Defines aggregating of multiple output values.
#     Array-like value defines weights used to average errors.
#
#     'raw_values' :
#         Returns a full set of errors in case of multioutput input.
#
#     'uniform_average' :
#         Errors of all outputs are averaged with uniform weight.
#
# squared : bool, default=True
#     If True returns MSE value, if False returns RMSE value.
#
# Returns
# -------
# loss : float or ndarray of floats
#     A non-negative floating point value (the best value is 0.0), or an
#     array of floating point values, one for each individual target.
#
# Examples
# --------
# >>> from sklearn.metrics import mean_squared_error
# >>> y_true = [3, -0.5, 2, 7]
# >>> y_pred = [2.5, 0.0, 2, 8]
# >>> mean_squared_error(y_true, y_pred)
# 0.375
# >>> y_true = [3, -0.5, 2, 7]
# >>> y_pred = [2.5, 0.0, 2, 8]
# >>> mean_squared_error(y_true, y_pred, squared=False)
# 0.612...
# >>> y_true = [[0.5, 1],[-1, 1],[7, -6]]
# >>> y_pred = [[0, 2],[-1, 2],[8, -5]]
# >>> mean_squared_error(y_true, y_pred)
# 0.708...
# >>> mean_squared_error(y_true, y_pred, squared=False)
# 0.822...
# >>> mean_squared_error(y_true, y_pred, multioutput='raw_values')
# array([0.41666667, 1.        ])
# >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
# 0.825...
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler</u></summary>
# <blockquote>
# <code>
# Transform features by scaling each feature to a given range.
#
# This estimator scales and translates each feature individually such
# that it is in the given range on the training set, e.g. between
# zero and one.
#
# The transformation is given by::
#
#     X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#     X_scaled = X_std * (max - min) + min
#
# where min, max = feature_range.
#
# This transformation is often used as an alternative to zero mean,
# unit variance scaling.
#
# Read more in the :ref:`User Guide <preprocessing_scaler>`.
#
# Parameters
# ----------
# feature_range : tuple (min, max), default=(0, 1)
#     Desired range of transformed data.
#
# copy : bool, default=True
#     Set to False to perform inplace row normalization and avoid a
#     copy (if the input is already a numpy array).
#
# clip : bool, default=False
#     Set to True to clip transformed values of held-out data to
#     provided `feature_range`.
#
#     .. versionadded:: 0.24
#
# Attributes
# ----------
# min_ : ndarray of shape (n_features,)
#     Per feature adjustment for minimum. Equivalent to
#     ``min - X.min(axis=0) * self.scale_``
#
# scale_ : ndarray of shape (n_features,)
#     Per feature relative scaling of the data. Equivalent to
#     ``(max - min) / (X.max(axis=0) - X.min(axis=0))``
#
#     .. versionadded:: 0.17
#        *scale_* attribute.
#
# data_min_ : ndarray of shape (n_features,)
#     Per feature minimum seen in the data
#
#     .. versionadded:: 0.17
#        *data_min_*
#
# data_max_ : ndarray of shape (n_features,)
#     Per feature maximum seen in the data
#
#     .. versionadded:: 0.17
#        *data_max_*
#
# data_range_ : ndarray of shape (n_features,)
#     Per feature range ``(data_max_ - data_min_)`` seen in the data
#
#     .. versionadded:: 0.17
#        *data_range_*
#
# n_features_in_ : int
#     Number of features seen during :term:`fit`.
#
#     .. versionadded:: 0.24
#
# n_samples_seen_ : int
#     The number of samples processed by the estimator.
#     It will be reset on new calls to fit, but increments across
#     ``partial_fit`` calls.
#
# feature_names_in_ : ndarray of shape (`n_features_in_`,)
#     Names of features seen during :term:`fit`. Defined only when `X`
#     has feature names that are all strings.
#
#     .. versionadded:: 1.0
#
# See Also
# --------
# minmax_scale : Equivalent function without the estimator API.
#
# Notes
# -----
# NaNs are treated as missing values: disregarded in fit, and maintained in
# transform.
#
# For a comparison of the different scalers, transformers, and normalizers,
# see :ref:`examples/preprocessing/plot_all_scaling.py
# <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
#
# Examples
# --------
# >>> from sklearn.preprocessing import MinMaxScaler
# >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
# >>> scaler = MinMaxScaler()
# >>> print(scaler.fit(data))
# MinMaxScaler()
# >>> print(scaler.data_max_)
# [ 1. 18.]
# >>> print(scaler.transform(data))
# [[0.   0.  ]
#  [0.25 0.25]
#  [0.5  0.5 ]
#  [1.   1.  ]]
# >>> print(scaler.transform([[2, 2]]))
# [[1.5 0. ]]
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler.transform</u></summary>
# <blockquote>
# <code>
# Scale features of X according to feature_range.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input data that will be transformed.
#
# Returns
# -------
# Xt : ndarray of shape (n_samples, n_features)
#     Transformed data.
#
# </code>
# <a href='#top_phases'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>1. Library Loading</h1>  <a id='1'></a><small><a href='#top_phases'>back to top</a></small>

# %% _uuid="8f2839f25d086af736a60e9eeb907d3b93b6e0e5" _cell_guid="b1076dfc-b9ad-4769-8c92-a6c4dae69d19"
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found; the sub-directory list from each walk step is unused.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>2. Data Preparation</h1>  <a id='2'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.io.parsers.readers.read_csv</u></summary>
# <blockquote>
# <code>
# Read a comma-separated values (csv) file into DataFrame.
#
# Also supports optionally iterating or breaking of the file
# into chunks.
#
# Additional help can be found in the online docs for
# `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
#
# Parameters
# ----------
# filepath_or_buffer : str, path object or file-like object
#     Any valid string path is acceptable. The string could be a URL. Valid
#     URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
#     expected. A local file could be: file://localhost/path/to/table.csv.
#
#     If you want to pass in a path object, pandas accepts any ``os.PathLike``.
#
#     By file-like object, we refer to objects with a ``read()`` method, such as
#     a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
# sep : str, default ','
#     Delimiter to use. If sep is None, the C engine cannot automatically detect
#     the separator, but the Python parsing engine can, meaning the latter will
#     be used and automatically detect the separator by Python's builtin sniffer
#     tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
#     different from ``'\s+'`` will be interpreted as regular expressions and
#     will also force the use of the Python parsing engine. Note that regex
#     delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
# delimiter : str, default ``None``
#     Alias for sep.
# header : int, list of int, None, default 'infer'
#     Row number(s) to use as the column names, and the start of the
#     data.  Default behavior is to infer the column names: if no names
#     are passed the behavior is identical to ``header=0`` and column
#     names are inferred from the first line of the file, if column
#     names are passed explicitly then the behavior is identical to
#     ``header=None``. Explicitly pass ``header=0`` to be able to
#     replace existing names. The header can be a list of integers that
#     specify row locations for a multi-index on the columns
#     e.g. [0,1,3]. Intervening rows that are not specified will be
#     skipped (e.g. 2 in this example is skipped). Note that this
#     parameter ignores commented lines and empty lines if
#     ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
#     data rather than the first line of the file.
# names : array-like, optional
#     List of column names to use. If the file contains a header row,
#     then you should explicitly pass ``header=0`` to override the column names.
#     Duplicates in this list are not allowed.
# index_col : int, str, sequence of int / str, or False, optional, default ``None``
#   Column(s) to use as the row labels of the ``DataFrame``, either given as
#   string name or column index. If a sequence of int / str is given, a
#   MultiIndex is used.
#
#   Note: ``index_col=False`` can be used to force pandas to *not* use the first
#   column as the index, e.g. when you have a malformed file with delimiters at
#   the end of each line.
# usecols : list-like or callable, optional
#     Return a subset of the columns. If list-like, all elements must either
#     be positional (i.e. integer indices into the document columns) or strings
#     that correspond to column names provided either by the user in `names` or
#     inferred from the document header row(s). If ``names`` are given, the document
#     header row(s) are not taken into account. For example, a valid list-like
#     `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
#     Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
#     To instantiate a DataFrame from ``data`` with element order preserved use
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
#     in ``['foo', 'bar']`` order or
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
#     for ``['bar', 'foo']`` order.
#
#     If callable, the callable function will be evaluated against the column
#     names, returning names where the callable function evaluates to True. An
#     example of a valid callable argument would be ``lambda x: x.upper() in
#     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
#     parsing time and lower memory usage.
# squeeze : bool, default False
#     If the parsed data only contains one column then return a Series.
#
#     .. deprecated:: 1.4.0
#         Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
#         the data.
# prefix : str, optional
#     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
#
#     .. deprecated:: 1.4.0
#        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
# mangle_dupe_cols : bool, default True
#     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
#     'X'...'X'. Passing in False will cause data to be overwritten if there
#     are duplicate names in the columns.
# dtype : Type name or dict of column -> type, optional
#     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
#     'c': 'Int64'}
#     Use `str` or `object` together with suitable `na_values` settings
#     to preserve and not interpret dtype.
#     If converters are specified, they will be applied INSTEAD
#     of dtype conversion.
# engine : {'c', 'python', 'pyarrow'}, optional
#     Parser engine to use. The C and pyarrow engines are faster, while the python engine
#     is currently more feature-complete. Multithreading is currently only supported by
#     the pyarrow engine.
#
#     .. versionadded:: 1.4.0
#
#         The "pyarrow" engine was added as an *experimental* engine, and some features
#         are unsupported, or may not work correctly, with this engine.
# converters : dict, optional
#     Dict of functions for converting values in certain columns. Keys can either
#     be integers or column labels.
# true_values : list, optional
#     Values to consider as True.
# false_values : list, optional
#     Values to consider as False.
# skipinitialspace : bool, default False
#     Skip spaces after delimiter.
# skiprows : list-like, int or callable, optional
#     Line numbers to skip (0-indexed) or number of lines to skip (int)
#     at the start of the file.
#
#     If callable, the callable function will be evaluated against the row
#     indices, returning True if the row should be skipped and False otherwise.
#     An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
# skipfooter : int, default 0
#     Number of lines at bottom of file to skip (Unsupported with engine='c').
# nrows : int, optional
#     Number of rows of file to read. Useful for reading pieces of large files.
# na_values : scalar, str, list-like, or dict, optional
#     Additional strings to recognize as NA/NaN. If dict passed, specific
#     per-column NA values.  By default the following values are interpreted as
#     NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
#     '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
#     'nan', 'null'.
# keep_default_na : bool, default True
#     Whether or not to include the default NaN values when parsing the data.
#     Depending on whether `na_values` is passed in, the behavior is as follows:
#
#     * If `keep_default_na` is True, and `na_values` are specified, `na_values`
#       is appended to the default NaN values used for parsing.
#     * If `keep_default_na` is True, and `na_values` are not specified, only
#       the default NaN values are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are specified, only
#       the NaN values specified in `na_values` are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are not specified, no
#       strings will be parsed as NaN.
#
#     Note that if `na_filter` is passed in as False, the `keep_default_na` and
#     `na_values` parameters will be ignored.
# na_filter : bool, default True
#     Detect missing value markers (empty strings and the value of na_values). In
#     data without any NAs, passing na_filter=False can improve the performance
#     of reading a large file.
# verbose : bool, default False
#     Indicate number of NA values placed in non-numeric columns.
# skip_blank_lines : bool, default True
#     If True, skip over blank lines rather than interpreting as NaN values.
# parse_dates : bool or list of int or names or list of lists or dict, default False
#     The behavior is as follows:
#
#     * boolean. If True -> try parsing the index.
#     * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
#       each as a separate date column.
#     * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
#       a single date column.
#     * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
#       result 'foo'
#
#     If a column or index cannot be represented as an array of datetimes,
#     say because of an unparsable value or a mixture of timezones, the column
#     or index will be returned unaltered as an object data type. For
#     non-standard datetime parsing, use ``pd.to_datetime`` after
#     ``pd.read_csv``. To parse an index or column with a mixture of timezones,
#     specify ``date_parser`` to be a partially-applied
#     :func:`pandas.to_datetime` with ``utc=True``. See
#     :ref:`io.csv.mixed_timezones` for more.
#
#     Note: A fast-path exists for iso8601-formatted dates.
# infer_datetime_format : bool, default False
#     If True and `parse_dates` is enabled, pandas will attempt to infer the
#     format of the datetime strings in the columns, and if it can be inferred,
#     switch to a faster method of parsing them. In some cases this can increase
#     the parsing speed by 5-10x.
# keep_date_col : bool, default False
#     If True and `parse_dates` specifies combining multiple columns then
#     keep the original columns.
# date_parser : function, optional
#     Function to use for converting a sequence of string columns to an array of
#     datetime instances. The default uses ``dateutil.parser.parser`` to do the
#     conversion. Pandas will try to call `date_parser` in three different ways,
#     advancing to the next if an exception occurs: 1) Pass one or more arrays
#     (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
#     string values from the columns defined by `parse_dates` into a single array
#     and pass that; and 3) call `date_parser` once for each row using one or
#     more strings (corresponding to the columns defined by `parse_dates`) as
#     arguments.
# dayfirst : bool, default False
#     DD/MM format dates, international and European format.
# cache_dates : bool, default True
#     If True, use a cache of unique, converted dates to apply the datetime
#     conversion. May produce significant speed-up when parsing duplicate
#     date strings, especially ones with timezone offsets.
#
#     .. versionadded:: 0.25.0
# iterator : bool, default False
#     Return TextFileReader object for iteration or getting chunks with
#     ``get_chunk()``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# chunksize : int, optional
#     Return TextFileReader object for iteration.
#     See the `IO Tools docs
#     <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
#     for more information on ``iterator`` and ``chunksize``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# compression : str or dict, default 'infer'
#     For on-the-fly decompression of on-disk data. If 'infer' and
#     'filepath_or_buffer' is path-like, then detect compression from the
#     following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using
#     'zip', the ZIP file must contain only one data file to be read in. Set to
#     ``None`` for no decompression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
#     example, the following could be passed for Zstandard decompression using a
#     custom compression dictionary:
#     ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
#
#     .. versionchanged:: 1.4.0 Zstandard support.
#
# thousands : str, optional
#     Thousands separator.
# decimal : str, default '.'
#     Character to recognize as decimal point (e.g. use ',' for European data).
# lineterminator : str (length 1), optional
#     Character to break file into lines. Only valid with C parser.
# quotechar : str (length 1), optional
#     The character used to denote the start and end of a quoted item. Quoted
#     items can include the delimiter and it will be ignored.
# quoting : int or csv.QUOTE_* instance, default 0
#     Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
#     QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
# doublequote : bool, default ``True``
#    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
#    whether or not to interpret two consecutive quotechar elements INSIDE a
#    field as a single ``quotechar`` element.
# escapechar : str (length 1), optional
#     One-character string used to escape other characters.
# comment : str, optional
#     Indicates remainder of line should not be parsed. If found at the beginning
#     of a line, the line will be ignored altogether. This parameter must be a
#     single character. Like empty lines (as long as ``skip_blank_lines=True``),
#     fully commented lines are ignored by the parameter `header` but not by
#     `skiprows`. For example, if ``comment='#'``, parsing
#     ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
#     treated as the header.
# encoding : str, optional
#     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
#     standard encodings
#     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
#
#     .. versionchanged:: 1.2
#
#        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
#        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
#        This behavior was previously only the case for ``engine="python"``.
#
#     .. versionchanged:: 1.3.0
#
#        ``encoding_errors`` is a new argument. ``encoding`` no longer has an
#        influence on how encoding errors are handled.
#
# encoding_errors : str, optional, default "strict"
#     How encoding errors are treated. `List of possible values
#     <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
#
#     .. versionadded:: 1.3.0
#
# dialect : str or csv.Dialect, optional
#     If provided, this parameter will override values (default or not) for the
#     following parameters: `delimiter`, `doublequote`, `escapechar`,
#     `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
#     override values, a ParserWarning will be issued. See csv.Dialect
#     documentation for more details.
# error_bad_lines : bool, optional, default ``None``
#     Lines with too many fields (e.g. a csv line with too many commas) will by
#     default cause an exception to be raised, and no DataFrame will be returned.
#     If False, then these "bad lines" will be dropped from the DataFrame that is
#     returned.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# warn_bad_lines : bool, optional, default ``None``
#     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
#     "bad line" will be output.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
#     Specifies what to do upon encountering a bad line (a line with too many fields).
#     Allowed values are :
#
#         - 'error', raise an Exception when a bad line is encountered.
#         - 'warn', raise a warning when a bad line is encountered and skip that line.
#         - 'skip', skip bad lines without raising or warning when they are encountered.
#
#     .. versionadded:: 1.3.0
#
#         - callable, function with signature
#           ``(bad_line: list[str]) -> list[str] | None`` that will process a single
#           bad line. ``bad_line`` is a list of strings split by the ``sep``.
#           If the function returns ``None``, the bad line will be ignored.
#           If the function returns a new list of strings with more elements than
#           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
#           Only supported when ``engine="python"``
#
#     .. versionadded:: 1.4.0
#
# delim_whitespace : bool, default False
#     Specifies whether or not whitespace (e.g. ``' '`` or ``'    '``) will be
#     used as the sep. Equivalent to setting ``sep='\s+'``. If this option
#     is set to True, nothing should be passed in for the ``delimiter``
#     parameter.
# low_memory : bool, default True
#     Internally process the file in chunks, resulting in lower memory use
#     while parsing, but possibly mixed type inference.  To ensure no mixed
#     types either set False, or specify the type with the `dtype` parameter.
#     Note that the entire file is read into a single DataFrame regardless,
#     use the `chunksize` or `iterator` parameter to return the data in chunks.
#     (Only valid with C parser).
# memory_map : bool, default False
#     If a filepath is provided for `filepath_or_buffer`, map the file object
#     directly onto memory and access the data directly from there. Using this
#     option can improve performance because there is no longer any I/O overhead.
# float_precision : str, optional
#     Specifies which converter the C engine should use for floating-point
#     values. The options are ``None`` or 'high' for the ordinary converter,
#     'legacy' for the original lower precision pandas converter, and
#     'round_trip' for the round-trip converter.
#
#     .. versionchanged:: 1.2
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2
#
# Returns
# -------
# DataFrame or TextParser
#     A comma-separated values (csv) file is returned as two-dimensional
#     data structure with labeled axes.
#
# See Also
# --------
# DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_fwf : Read a table of fixed-width formatted lines into DataFrame.
#
# Examples
# --------
# >>> pd.read_csv('data.csv')  # doctest: +SKIP
#
# </code>
# <a href='#2'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#2'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Read sales_train.csv from the Kaggle input directory into `train`.
train = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
)
# Last expression of the cell: preview the first two rows.
train.head(n=2)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>3. Data Preparation</h1>  <a id='3'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.io.parsers.readers.read_csv</u></summary>
# <blockquote>
# <code>
# Read a comma-separated values (csv) file into DataFrame.
#
# Also supports optionally iterating or breaking of the file
# into chunks.
#
# Additional help can be found in the online docs for
# `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
#
# Parameters
# ----------
# filepath_or_buffer : str, path object or file-like object
#     Any valid string path is acceptable. The string could be a URL. Valid
#     URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
#     expected. A local file could be: file://localhost/path/to/table.csv.
#
#     If you want to pass in a path object, pandas accepts any ``os.PathLike``.
#
#     By file-like object, we refer to objects with a ``read()`` method, such as
#     a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
# sep : str, default ','
#     Delimiter to use. If sep is None, the C engine cannot automatically detect
#     the separator, but the Python parsing engine can, meaning the latter will
#     be used and automatically detect the separator by Python's builtin sniffer
#     tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
#     different from ``'\s+'`` will be interpreted as regular expressions and
#     will also force the use of the Python parsing engine. Note that regex
#     delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
# delimiter : str, default ``None``
#     Alias for sep.
# header : int, list of int, None, default 'infer'
#     Row number(s) to use as the column names, and the start of the
#     data.  Default behavior is to infer the column names: if no names
#     are passed the behavior is identical to ``header=0`` and column
#     names are inferred from the first line of the file, if column
#     names are passed explicitly then the behavior is identical to
#     ``header=None``. Explicitly pass ``header=0`` to be able to
#     replace existing names. The header can be a list of integers that
#     specify row locations for a multi-index on the columns
#     e.g. [0,1,3]. Intervening rows that are not specified will be
#     skipped (e.g. 2 in this example is skipped). Note that this
#     parameter ignores commented lines and empty lines if
#     ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
#     data rather than the first line of the file.
# names : array-like, optional
#     List of column names to use. If the file contains a header row,
#     then you should explicitly pass ``header=0`` to override the column names.
#     Duplicates in this list are not allowed.
# index_col : int, str, sequence of int / str, or False, optional, default ``None``
#   Column(s) to use as the row labels of the ``DataFrame``, either given as
#   string name or column index. If a sequence of int / str is given, a
#   MultiIndex is used.
#
#   Note: ``index_col=False`` can be used to force pandas to *not* use the first
#   column as the index, e.g. when you have a malformed file with delimiters at
#   the end of each line.
# usecols : list-like or callable, optional
#     Return a subset of the columns. If list-like, all elements must either
#     be positional (i.e. integer indices into the document columns) or strings
#     that correspond to column names provided either by the user in `names` or
#     inferred from the document header row(s). If ``names`` are given, the document
#     header row(s) are not taken into account. For example, a valid list-like
#     `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
#     Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
#     To instantiate a DataFrame from ``data`` with element order preserved use
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
#     in ``['foo', 'bar']`` order or
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
#     for ``['bar', 'foo']`` order.
#
#     If callable, the callable function will be evaluated against the column
#     names, returning names where the callable function evaluates to True. An
#     example of a valid callable argument would be ``lambda x: x.upper() in
#     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
#     parsing time and lower memory usage.
# squeeze : bool, default False
#     If the parsed data only contains one column then return a Series.
#
#     .. deprecated:: 1.4.0
#         Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
#         the data.
# prefix : str, optional
#     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
#
#     .. deprecated:: 1.4.0
#        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
# mangle_dupe_cols : bool, default True
#     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
#     'X'...'X'. Passing in False will cause data to be overwritten if there
#     are duplicate names in the columns.
# dtype : Type name or dict of column -> type, optional
#     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
#     'c': 'Int64'}
#     Use `str` or `object` together with suitable `na_values` settings
#     to preserve and not interpret dtype.
#     If converters are specified, they will be applied INSTEAD
#     of dtype conversion.
# engine : {'c', 'python', 'pyarrow'}, optional
#     Parser engine to use. The C and pyarrow engines are faster, while the python engine
#     is currently more feature-complete. Multithreading is currently only supported by
#     the pyarrow engine.
#
#     .. versionadded:: 1.4.0
#
#         The "pyarrow" engine was added as an *experimental* engine, and some features
#         are unsupported, or may not work correctly, with this engine.
# converters : dict, optional
#     Dict of functions for converting values in certain columns. Keys can either
#     be integers or column labels.
# true_values : list, optional
#     Values to consider as True.
# false_values : list, optional
#     Values to consider as False.
# skipinitialspace : bool, default False
#     Skip spaces after delimiter.
# skiprows : list-like, int or callable, optional
#     Line numbers to skip (0-indexed) or number of lines to skip (int)
#     at the start of the file.
#
#     If callable, the callable function will be evaluated against the row
#     indices, returning True if the row should be skipped and False otherwise.
#     An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
# skipfooter : int, default 0
#     Number of lines at bottom of file to skip (Unsupported with engine='c').
# nrows : int, optional
#     Number of rows of file to read. Useful for reading pieces of large files.
# na_values : scalar, str, list-like, or dict, optional
#     Additional strings to recognize as NA/NaN. If dict passed, specific
#     per-column NA values.  By default the following values are interpreted as
#     NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
#     '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
#     'nan', 'null'.
# keep_default_na : bool, default True
#     Whether or not to include the default NaN values when parsing the data.
#     Depending on whether `na_values` is passed in, the behavior is as follows:
#
#     * If `keep_default_na` is True, and `na_values` are specified, `na_values`
#       is appended to the default NaN values used for parsing.
#     * If `keep_default_na` is True, and `na_values` are not specified, only
#       the default NaN values are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are specified, only
#       the NaN values specified in `na_values` are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are not specified, no
#       strings will be parsed as NaN.
#
#     Note that if `na_filter` is passed in as False, the `keep_default_na` and
#     `na_values` parameters will be ignored.
# na_filter : bool, default True
#     Detect missing value markers (empty strings and the value of na_values). In
#     data without any NAs, passing na_filter=False can improve the performance
#     of reading a large file.
# verbose : bool, default False
#     Indicate number of NA values placed in non-numeric columns.
# skip_blank_lines : bool, default True
#     If True, skip over blank lines rather than interpreting as NaN values.
# parse_dates : bool or list of int or names or list of lists or dict, default False
#     The behavior is as follows:
#
#     * boolean. If True -> try parsing the index.
#     * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
#       each as a separate date column.
#     * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
#       a single date column.
#     * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
#       result 'foo'
#
#     If a column or index cannot be represented as an array of datetimes,
#     say because of an unparsable value or a mixture of timezones, the column
#     or index will be returned unaltered as an object data type. For
#     non-standard datetime parsing, use ``pd.to_datetime`` after
#     ``pd.read_csv``. To parse an index or column with a mixture of timezones,
#     specify ``date_parser`` to be a partially-applied
#     :func:`pandas.to_datetime` with ``utc=True``. See
#     :ref:`io.csv.mixed_timezones` for more.
#
#     Note: A fast-path exists for iso8601-formatted dates.
# infer_datetime_format : bool, default False
#     If True and `parse_dates` is enabled, pandas will attempt to infer the
#     format of the datetime strings in the columns, and if it can be inferred,
#     switch to a faster method of parsing them. In some cases this can increase
#     the parsing speed by 5-10x.
# keep_date_col : bool, default False
#     If True and `parse_dates` specifies combining multiple columns then
#     keep the original columns.
# date_parser : function, optional
#     Function to use for converting a sequence of string columns to an array of
#     datetime instances. The default uses ``dateutil.parser.parser`` to do the
#     conversion. Pandas will try to call `date_parser` in three different ways,
#     advancing to the next if an exception occurs: 1) Pass one or more arrays
#     (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
#     string values from the columns defined by `parse_dates` into a single array
#     and pass that; and 3) call `date_parser` once for each row using one or
#     more strings (corresponding to the columns defined by `parse_dates`) as
#     arguments.
# dayfirst : bool, default False
#     DD/MM format dates, international and European format.
# cache_dates : bool, default True
#     If True, use a cache of unique, converted dates to apply the datetime
#     conversion. May produce significant speed-up when parsing duplicate
#     date strings, especially ones with timezone offsets.
#
#     .. versionadded:: 0.25.0
# iterator : bool, default False
#     Return TextFileReader object for iteration or getting chunks with
#     ``get_chunk()``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# chunksize : int, optional
#     Return TextFileReader object for iteration.
#     See the `IO Tools docs
#     <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
#     for more information on ``iterator`` and ``chunksize``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# compression : str or dict, default 'infer'
#     For on-the-fly decompression of on-disk data. If 'infer' and '%s' is
#     path-like, then detect compression from the following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using
#     'zip', the ZIP file must contain only one data file to be read in. Set to
#     ``None`` for no decompression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
#     example, the following could be passed for Zstandard decompression using a
#     custom compression dictionary:
#     ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
#
#     .. versionchanged:: 1.4.0 Zstandard support.
#
# thousands : str, optional
#     Thousands separator.
# decimal : str, default '.'
#     Character to recognize as decimal point (e.g. use ',' for European data).
# lineterminator : str (length 1), optional
#     Character to break file into lines. Only valid with C parser.
# quotechar : str (length 1), optional
#     The character used to denote the start and end of a quoted item. Quoted
#     items can include the delimiter and it will be ignored.
# quoting : int or csv.QUOTE_* instance, default 0
#     Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
#     QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
# doublequote : bool, default ``True``
#    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
#    whether or not to interpret two consecutive quotechar elements INSIDE a
#    field as a single ``quotechar`` element.
# escapechar : str (length 1), optional
#     One-character string used to escape other characters.
# comment : str, optional
#     Indicates remainder of line should not be parsed. If found at the beginning
#     of a line, the line will be ignored altogether. This parameter must be a
#     single character. Like empty lines (as long as ``skip_blank_lines=True``),
#     fully commented lines are ignored by the parameter `header` but not by
#     `skiprows`. For example, if ``comment='#'``, parsing
#     ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
#     treated as the header.
# encoding : str, optional
#     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
#     standard encodings
#     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
#
#     .. versionchanged:: 1.2
#
#        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
#        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
#        This behavior was previously only the case for ``engine="python"``.
#
#     .. versionchanged:: 1.3.0
#
#        ``encoding_errors`` is a new argument. ``encoding`` no longer has an
#        influence on how encoding errors are handled.
#
# encoding_errors : str, optional, default "strict"
#     How encoding errors are treated. `List of possible values
#     <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
#
#     .. versionadded:: 1.3.0
#
# dialect : str or csv.Dialect, optional
#     If provided, this parameter will override values (default or not) for the
#     following parameters: `delimiter`, `doublequote`, `escapechar`,
#     `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
#     override values, a ParserWarning will be issued. See csv.Dialect
#     documentation for more details.
# error_bad_lines : bool, optional, default ``None``
#     Lines with too many fields (e.g. a csv line with too many commas) will by
#     default cause an exception to be raised, and no DataFrame will be returned.
#     If False, then these "bad lines" will be dropped from the DataFrame that is
#     returned.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# warn_bad_lines : bool, optional, default ``None``
#     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
#     "bad line" will be output.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
#     Specifies what to do upon encountering a bad line (a line with too many fields).
#     Allowed values are :
#
#         - 'error', raise an Exception when a bad line is encountered.
#         - 'warn', raise a warning when a bad line is encountered and skip that line.
#         - 'skip', skip bad lines without raising or warning when they are encountered.
#
#     .. versionadded:: 1.3.0
#
#         - callable, function with signature
#           ``(bad_line: list[str]) -> list[str] | None`` that will process a single
#           bad line. ``bad_line`` is a list of strings split by the ``sep``.
#           If the function returns ``None``, the bad line will be ignored.
#           If the function returns a new list of strings with more elements than
#           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
#           Only supported when ``engine="python"``
#
#     .. versionadded:: 1.4.0
#
# delim_whitespace : bool, default False
#     Specifies whether or not whitespace (e.g. ``' '`` or ``'    '``) will be
#     used as the sep. Equivalent to setting ``sep='\s+'``. If this option
#     is set to True, nothing should be passed in for the ``delimiter``
#     parameter.
# low_memory : bool, default True
#     Internally process the file in chunks, resulting in lower memory use
#     while parsing, but possibly mixed type inference.  To ensure no mixed
#     types either set False, or specify the type with the `dtype` parameter.
#     Note that the entire file is read into a single DataFrame regardless,
#     use the `chunksize` or `iterator` parameter to return the data in chunks.
#     (Only valid with C parser).
# memory_map : bool, default False
#     If a filepath is provided for `filepath_or_buffer`, map the file object
#     directly onto memory and access the data directly from there. Using this
#     option can improve performance because there is no longer any I/O overhead.
# float_precision : str, optional
#     Specifies which converter the C engine should use for floating-point
#     values. The options are ``None`` or 'high' for the ordinary converter,
#     'legacy' for the original lower precision pandas converter, and
#     'round_trip' for the round-trip converter.
#
#     .. versionchanged:: 1.2
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2
#
# Returns
# -------
# DataFrame or TextParser
#     A comma-separated values (csv) file is returned as two-dimensional
#     data structure with labeled axes.
#
# See Also
# --------
# DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_fwf : Read a table of fixed-width formatted lines into DataFrame.
#
# Examples
# --------
# >>> pd.read_csv('data.csv')  # doctest: +SKIP
#
# </code>
# <a href='#3'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#3'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Read items.csv from the Kaggle input directory into `item`.
item = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
)
# Last expression of the cell: preview the first two rows.
item.head(n=2)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>4. Data Preparation</h1>  <a id='4'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.io.parsers.readers.read_csv</u></summary>
# <blockquote>
# <code>
# Read a comma-separated values (csv) file into DataFrame.
#
# Also supports optionally iterating or breaking of the file
# into chunks.
#
# Additional help can be found in the online docs for
# `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
#
# Parameters
# ----------
# filepath_or_buffer : str, path object or file-like object
#     Any valid string path is acceptable. The string could be a URL. Valid
#     URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
#     expected. A local file could be: file://localhost/path/to/table.csv.
#
#     If you want to pass in a path object, pandas accepts any ``os.PathLike``.
#
#     By file-like object, we refer to objects with a ``read()`` method, such as
#     a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
# sep : str, default ','
#     Delimiter to use. If sep is None, the C engine cannot automatically detect
#     the separator, but the Python parsing engine can, meaning the latter will
#     be used and automatically detect the separator by Python's builtin sniffer
#     tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
#     different from ``'\s+'`` will be interpreted as regular expressions and
#     will also force the use of the Python parsing engine. Note that regex
#     delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
# delimiter : str, default ``None``
#     Alias for sep.
# header : int, list of int, None, default 'infer'
#     Row number(s) to use as the column names, and the start of the
#     data.  Default behavior is to infer the column names: if no names
#     are passed the behavior is identical to ``header=0`` and column
#     names are inferred from the first line of the file, if column
#     names are passed explicitly then the behavior is identical to
#     ``header=None``. Explicitly pass ``header=0`` to be able to
#     replace existing names. The header can be a list of integers that
#     specify row locations for a multi-index on the columns
#     e.g. [0,1,3]. Intervening rows that are not specified will be
#     skipped (e.g. 2 in this example is skipped). Note that this
#     parameter ignores commented lines and empty lines if
#     ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
#     data rather than the first line of the file.
# names : array-like, optional
#     List of column names to use. If the file contains a header row,
#     then you should explicitly pass ``header=0`` to override the column names.
#     Duplicates in this list are not allowed.
# index_col : int, str, sequence of int / str, or False, optional, default ``None``
#   Column(s) to use as the row labels of the ``DataFrame``, either given as
#   string name or column index. If a sequence of int / str is given, a
#   MultiIndex is used.
#
#   Note: ``index_col=False`` can be used to force pandas to *not* use the first
#   column as the index, e.g. when you have a malformed file with delimiters at
#   the end of each line.
# usecols : list-like or callable, optional
#     Return a subset of the columns. If list-like, all elements must either
#     be positional (i.e. integer indices into the document columns) or strings
#     that correspond to column names provided either by the user in `names` or
#     inferred from the document header row(s). If ``names`` are given, the document
#     header row(s) are not taken into account. For example, a valid list-like
#     `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
#     Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
#     To instantiate a DataFrame from ``data`` with element order preserved use
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
#     in ``['foo', 'bar']`` order or
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
#     for ``['bar', 'foo']`` order.
#
#     If callable, the callable function will be evaluated against the column
#     names, returning names where the callable function evaluates to True. An
#     example of a valid callable argument would be ``lambda x: x.upper() in
#     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
#     parsing time and lower memory usage.
# squeeze : bool, default False
#     If the parsed data only contains one column then return a Series.
#
#     .. deprecated:: 1.4.0
#         Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
#         the data.
# prefix : str, optional
#     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
#
#     .. deprecated:: 1.4.0
#        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
# mangle_dupe_cols : bool, default True
#     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
#     'X'...'X'. Passing in False will cause data to be overwritten if there
#     are duplicate names in the columns.
# dtype : Type name or dict of column -> type, optional
#     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
#     'c': 'Int64'}
#     Use `str` or `object` together with suitable `na_values` settings
#     to preserve and not interpret dtype.
#     If converters are specified, they will be applied INSTEAD
#     of dtype conversion.
# engine : {'c', 'python', 'pyarrow'}, optional
#     Parser engine to use. The C and pyarrow engines are faster, while the python engine
#     is currently more feature-complete. Multithreading is currently only supported by
#     the pyarrow engine.
#
#     .. versionadded:: 1.4.0
#
#         The "pyarrow" engine was added as an *experimental* engine, and some features
#         are unsupported, or may not work correctly, with this engine.
# converters : dict, optional
#     Dict of functions for converting values in certain columns. Keys can either
#     be integers or column labels.
# true_values : list, optional
#     Values to consider as True.
# false_values : list, optional
#     Values to consider as False.
# skipinitialspace : bool, default False
#     Skip spaces after delimiter.
# skiprows : list-like, int or callable, optional
#     Line numbers to skip (0-indexed) or number of lines to skip (int)
#     at the start of the file.
#
#     If callable, the callable function will be evaluated against the row
#     indices, returning True if the row should be skipped and False otherwise.
#     An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
# skipfooter : int, default 0
#     Number of lines at bottom of file to skip (Unsupported with engine='c').
# nrows : int, optional
#     Number of rows of file to read. Useful for reading pieces of large files.
# na_values : scalar, str, list-like, or dict, optional
#     Additional strings to recognize as NA/NaN. If dict passed, specific
#     per-column NA values.  By default the following values are interpreted as
#     NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
#     '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
#     'nan', 'null'.
# keep_default_na : bool, default True
#     Whether or not to include the default NaN values when parsing the data.
#     Depending on whether `na_values` is passed in, the behavior is as follows:
#
#     * If `keep_default_na` is True, and `na_values` are specified, `na_values`
#       is appended to the default NaN values used for parsing.
#     * If `keep_default_na` is True, and `na_values` are not specified, only
#       the default NaN values are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are specified, only
#       the NaN values specified in `na_values` are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are not specified, no
#       strings will be parsed as NaN.
#
#     Note that if `na_filter` is passed in as False, the `keep_default_na` and
#     `na_values` parameters will be ignored.
# na_filter : bool, default True
#     Detect missing value markers (empty strings and the value of na_values). In
#     data without any NAs, passing na_filter=False can improve the performance
#     of reading a large file.
# verbose : bool, default False
#     Indicate number of NA values placed in non-numeric columns.
# skip_blank_lines : bool, default True
#     If True, skip over blank lines rather than interpreting as NaN values.
# parse_dates : bool or list of int or names or list of lists or dict, default False
#     The behavior is as follows:
#
#     * boolean. If True -> try parsing the index.
#     * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
#       each as a separate date column.
#     * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
#       a single date column.
#     * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
#       result 'foo'
#
#     If a column or index cannot be represented as an array of datetimes,
#     say because of an unparsable value or a mixture of timezones, the column
#     or index will be returned unaltered as an object data type. For
#     non-standard datetime parsing, use ``pd.to_datetime`` after
#     ``pd.read_csv``. To parse an index or column with a mixture of timezones,
#     specify ``date_parser`` to be a partially-applied
#     :func:`pandas.to_datetime` with ``utc=True``. See
#     :ref:`io.csv.mixed_timezones` for more.
#
#     Note: A fast-path exists for iso8601-formatted dates.
# infer_datetime_format : bool, default False
#     If True and `parse_dates` is enabled, pandas will attempt to infer the
#     format of the datetime strings in the columns, and if it can be inferred,
#     switch to a faster method of parsing them. In some cases this can increase
#     the parsing speed by 5-10x.
# keep_date_col : bool, default False
#     If True and `parse_dates` specifies combining multiple columns then
#     keep the original columns.
# date_parser : function, optional
#     Function to use for converting a sequence of string columns to an array of
#     datetime instances. The default uses ``dateutil.parser.parser`` to do the
#     conversion. Pandas will try to call `date_parser` in three different ways,
#     advancing to the next if an exception occurs: 1) Pass one or more arrays
#     (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
#     string values from the columns defined by `parse_dates` into a single array
#     and pass that; and 3) call `date_parser` once for each row using one or
#     more strings (corresponding to the columns defined by `parse_dates`) as
#     arguments.
# dayfirst : bool, default False
#     DD/MM format dates, international and European format.
# cache_dates : bool, default True
#     If True, use a cache of unique, converted dates to apply the datetime
#     conversion. May produce significant speed-up when parsing duplicate
#     date strings, especially ones with timezone offsets.
#
#     .. versionadded:: 0.25.0
# iterator : bool, default False
#     Return TextFileReader object for iteration or getting chunks with
#     ``get_chunk()``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# chunksize : int, optional
#     Return TextFileReader object for iteration.
#     See the `IO Tools docs
#     <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
#     for more information on ``iterator`` and ``chunksize``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# compression : str or dict, default 'infer'
#     For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
#     path-like, then detect compression from the following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using
#     'zip', the ZIP file must contain only one data file to be read in. Set to
#     ``None`` for no decompression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
#     example, the following could be passed for Zstandard decompression using a
#     custom compression dictionary:
#     ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
#
#     .. versionchanged:: 1.4.0 Zstandard support.
#
# thousands : str, optional
#     Thousands separator.
# decimal : str, default '.'
#     Character to recognize as decimal point (e.g. use ',' for European data).
# lineterminator : str (length 1), optional
#     Character to break file into lines. Only valid with C parser.
# quotechar : str (length 1), optional
#     The character used to denote the start and end of a quoted item. Quoted
#     items can include the delimiter and it will be ignored.
# quoting : int or csv.QUOTE_* instance, default 0
#     Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
#     QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
# doublequote : bool, default ``True``
#    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
#    whether or not to interpret two consecutive quotechar elements INSIDE a
#    field as a single ``quotechar`` element.
# escapechar : str (length 1), optional
#     One-character string used to escape other characters.
# comment : str, optional
#     Indicates remainder of line should not be parsed. If found at the beginning
#     of a line, the line will be ignored altogether. This parameter must be a
#     single character. Like empty lines (as long as ``skip_blank_lines=True``),
#     fully commented lines are ignored by the parameter `header` but not by
#     `skiprows`. For example, if ``comment='#'``, parsing
#     ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
#     treated as the header.
# encoding : str, optional
#     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
#     standard encodings
#     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
#
#     .. versionchanged:: 1.2
#
#        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
#        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
#        This behavior was previously only the case for ``engine="python"``.
#
#     .. versionchanged:: 1.3.0
#
#        ``encoding_errors`` is a new argument. ``encoding`` has no longer an
#        influence on how encoding errors are handled.
#
# encoding_errors : str, optional, default "strict"
#     How encoding errors are treated. `List of possible values
#     <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
#
#     .. versionadded:: 1.3.0
#
# dialect : str or csv.Dialect, optional
#     If provided, this parameter will override values (default or not) for the
#     following parameters: `delimiter`, `doublequote`, `escapechar`,
#     `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
#     override values, a ParserWarning will be issued. See csv.Dialect
#     documentation for more details.
# error_bad_lines : bool, optional, default ``None``
#     Lines with too many fields (e.g. a csv line with too many commas) will by
#     default cause an exception to be raised, and no DataFrame will be returned.
#     If False, then these "bad lines" will be dropped from the DataFrame that is
#     returned.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# warn_bad_lines : bool, optional, default ``None``
#     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
#     "bad line" will be output.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
#     Specifies what to do upon encountering a bad line (a line with too many fields).
#     Allowed values are :
#
#         - 'error', raise an Exception when a bad line is encountered.
#         - 'warn', raise a warning when a bad line is encountered and skip that line.
#         - 'skip', skip bad lines without raising or warning when they are encountered.
#
#     .. versionadded:: 1.3.0
#
#         - callable, function with signature
#           ``(bad_line: list[str]) -> list[str] | None`` that will process a single
#           bad line. ``bad_line`` is a list of strings split by the ``sep``.
#           If the function returns ``None``, the bad line will be ignored.
#           If the function returns a new list of strings with more elements than
#           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
#           Only supported when ``engine="python"``
#
#     .. versionadded:: 1.4.0
#
# delim_whitespace : bool, default False
#     Specifies whether or not whitespace (e.g. ``' '`` or ``'    '``) will be
#     used as the sep. Equivalent to setting ``sep='\s+'``. If this option
#     is set to True, nothing should be passed in for the ``delimiter``
#     parameter.
# low_memory : bool, default True
#     Internally process the file in chunks, resulting in lower memory use
#     while parsing, but possibly mixed type inference.  To ensure no mixed
#     types either set False, or specify the type with the `dtype` parameter.
#     Note that the entire file is read into a single DataFrame regardless,
#     use the `chunksize` or `iterator` parameter to return the data in chunks.
#     (Only valid with C parser).
# memory_map : bool, default False
#     If a filepath is provided for `filepath_or_buffer`, map the file object
#     directly onto memory and access the data directly from there. Using this
#     option can improve performance because there is no longer any I/O overhead.
# float_precision : str, optional
#     Specifies which converter the C engine should use for floating-point
#     values. The options are ``None`` or 'high' for the ordinary converter,
#     'legacy' for the original lower precision pandas converter, and
#     'round_trip' for the round-trip converter.
#
#     .. versionchanged:: 1.2
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2
#
# Returns
# -------
# DataFrame or TextParser
#     A comma-separated values (csv) file is returned as two-dimensional
#     data structure with labeled axes.
#
# See Also
# --------
# DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_fwf : Read a table of fixed-width formatted lines into DataFrame.
#
# Examples
# --------
# >>> pd.read_csv('data.csv')  # doctest: +SKIP
#
# </code>
# <a href='#4'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#4'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Read item_categories.csv from the competition's Kaggle input directory into `cat`.
cat = pd.read_csv(
    filepath_or_buffer='/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv',
)
# Trailing expression: the first two rows become the cell's displayed output.
cat.head(n=2)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>5. Data Preparation</h1>  <a id='5'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.io.parsers.readers.read_csv</u></summary>
# <blockquote>
# <code>
# Read a comma-separated values (csv) file into DataFrame.
#
# Also supports optionally iterating or breaking of the file
# into chunks.
#
# Additional help can be found in the online docs for
# `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
#
# Parameters
# ----------
# filepath_or_buffer : str, path object or file-like object
#     Any valid string path is acceptable. The string could be a URL. Valid
#     URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
#     expected. A local file could be: file://localhost/path/to/table.csv.
#
#     If you want to pass in a path object, pandas accepts any ``os.PathLike``.
#
#     By file-like object, we refer to objects with a ``read()`` method, such as
#     a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
# sep : str, default ','
#     Delimiter to use. If sep is None, the C engine cannot automatically detect
#     the separator, but the Python parsing engine can, meaning the latter will
#     be used and automatically detect the separator by Python's builtin sniffer
#     tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
#     different from ``'\s+'`` will be interpreted as regular expressions and
#     will also force the use of the Python parsing engine. Note that regex
#     delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
# delimiter : str, default ``None``
#     Alias for sep.
# header : int, list of int, None, default 'infer'
#     Row number(s) to use as the column names, and the start of the
#     data.  Default behavior is to infer the column names: if no names
#     are passed the behavior is identical to ``header=0`` and column
#     names are inferred from the first line of the file, if column
#     names are passed explicitly then the behavior is identical to
#     ``header=None``. Explicitly pass ``header=0`` to be able to
#     replace existing names. The header can be a list of integers that
#     specify row locations for a multi-index on the columns
#     e.g. [0,1,3]. Intervening rows that are not specified will be
#     skipped (e.g. 2 in this example is skipped). Note that this
#     parameter ignores commented lines and empty lines if
#     ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
#     data rather than the first line of the file.
# names : array-like, optional
#     List of column names to use. If the file contains a header row,
#     then you should explicitly pass ``header=0`` to override the column names.
#     Duplicates in this list are not allowed.
# index_col : int, str, sequence of int / str, or False, optional, default ``None``
#   Column(s) to use as the row labels of the ``DataFrame``, either given as
#   string name or column index. If a sequence of int / str is given, a
#   MultiIndex is used.
#
#   Note: ``index_col=False`` can be used to force pandas to *not* use the first
#   column as the index, e.g. when you have a malformed file with delimiters at
#   the end of each line.
# usecols : list-like or callable, optional
#     Return a subset of the columns. If list-like, all elements must either
#     be positional (i.e. integer indices into the document columns) or strings
#     that correspond to column names provided either by the user in `names` or
#     inferred from the document header row(s). If ``names`` are given, the document
#     header row(s) are not taken into account. For example, a valid list-like
#     `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
#     Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
#     To instantiate a DataFrame from ``data`` with element order preserved use
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
#     in ``['foo', 'bar']`` order or
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
#     for ``['bar', 'foo']`` order.
#
#     If callable, the callable function will be evaluated against the column
#     names, returning names where the callable function evaluates to True. An
#     example of a valid callable argument would be ``lambda x: x.upper() in
#     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
#     parsing time and lower memory usage.
# squeeze : bool, default False
#     If the parsed data only contains one column then return a Series.
#
#     .. deprecated:: 1.4.0
#         Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
#         the data.
# prefix : str, optional
#     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
#
#     .. deprecated:: 1.4.0
#        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
# mangle_dupe_cols : bool, default True
#     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
#     'X'...'X'. Passing in False will cause data to be overwritten if there
#     are duplicate names in the columns.
# dtype : Type name or dict of column -> type, optional
#     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
#     'c': 'Int64'}
#     Use `str` or `object` together with suitable `na_values` settings
#     to preserve and not interpret dtype.
#     If converters are specified, they will be applied INSTEAD
#     of dtype conversion.
# engine : {'c', 'python', 'pyarrow'}, optional
#     Parser engine to use. The C and pyarrow engines are faster, while the python engine
#     is currently more feature-complete. Multithreading is currently only supported by
#     the pyarrow engine.
#
#     .. versionadded:: 1.4.0
#
#         The "pyarrow" engine was added as an *experimental* engine, and some features
#         are unsupported, or may not work correctly, with this engine.
# converters : dict, optional
#     Dict of functions for converting values in certain columns. Keys can either
#     be integers or column labels.
# true_values : list, optional
#     Values to consider as True.
# false_values : list, optional
#     Values to consider as False.
# skipinitialspace : bool, default False
#     Skip spaces after delimiter.
# skiprows : list-like, int or callable, optional
#     Line numbers to skip (0-indexed) or number of lines to skip (int)
#     at the start of the file.
#
#     If callable, the callable function will be evaluated against the row
#     indices, returning True if the row should be skipped and False otherwise.
#     An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
# skipfooter : int, default 0
#     Number of lines at bottom of file to skip (Unsupported with engine='c').
# nrows : int, optional
#     Number of rows of file to read. Useful for reading pieces of large files.
# na_values : scalar, str, list-like, or dict, optional
#     Additional strings to recognize as NA/NaN. If dict passed, specific
#     per-column NA values.  By default the following values are interpreted as
#     NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
#     '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
#     'nan', 'null'.
# keep_default_na : bool, default True
#     Whether or not to include the default NaN values when parsing the data.
#     Depending on whether `na_values` is passed in, the behavior is as follows:
#
#     * If `keep_default_na` is True, and `na_values` are specified, `na_values`
#       is appended to the default NaN values used for parsing.
#     * If `keep_default_na` is True, and `na_values` are not specified, only
#       the default NaN values are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are specified, only
#       the NaN values specified in `na_values` are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are not specified, no
#       strings will be parsed as NaN.
#
#     Note that if `na_filter` is passed in as False, the `keep_default_na` and
#     `na_values` parameters will be ignored.
# na_filter : bool, default True
#     Detect missing value markers (empty strings and the value of na_values). In
#     data without any NAs, passing na_filter=False can improve the performance
#     of reading a large file.
# verbose : bool, default False
#     Indicate number of NA values placed in non-numeric columns.
# skip_blank_lines : bool, default True
#     If True, skip over blank lines rather than interpreting as NaN values.
# parse_dates : bool or list of int or names or list of lists or dict, default False
#     The behavior is as follows:
#
#     * boolean. If True -> try parsing the index.
#     * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
#       each as a separate date column.
#     * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
#       a single date column.
#     * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
#       result 'foo'
#
#     If a column or index cannot be represented as an array of datetimes,
#     say because of an unparsable value or a mixture of timezones, the column
#     or index will be returned unaltered as an object data type. For
#     non-standard datetime parsing, use ``pd.to_datetime`` after
#     ``pd.read_csv``. To parse an index or column with a mixture of timezones,
#     specify ``date_parser`` to be a partially-applied
#     :func:`pandas.to_datetime` with ``utc=True``. See
#     :ref:`io.csv.mixed_timezones` for more.
#
#     Note: A fast-path exists for iso8601-formatted dates.
# infer_datetime_format : bool, default False
#     If True and `parse_dates` is enabled, pandas will attempt to infer the
#     format of the datetime strings in the columns, and if it can be inferred,
#     switch to a faster method of parsing them. In some cases this can increase
#     the parsing speed by 5-10x.
# keep_date_col : bool, default False
#     If True and `parse_dates` specifies combining multiple columns then
#     keep the original columns.
# date_parser : function, optional
#     Function to use for converting a sequence of string columns to an array of
#     datetime instances. The default uses ``dateutil.parser.parser`` to do the
#     conversion. Pandas will try to call `date_parser` in three different ways,
#     advancing to the next if an exception occurs: 1) Pass one or more arrays
#     (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
#     string values from the columns defined by `parse_dates` into a single array
#     and pass that; and 3) call `date_parser` once for each row using one or
#     more strings (corresponding to the columns defined by `parse_dates`) as
#     arguments.
# dayfirst : bool, default False
#     DD/MM format dates, international and European format.
# cache_dates : bool, default True
#     If True, use a cache of unique, converted dates to apply the datetime
#     conversion. May produce significant speed-up when parsing duplicate
#     date strings, especially ones with timezone offsets.
#
#     .. versionadded:: 0.25.0
# iterator : bool, default False
#     Return TextFileReader object for iteration or getting chunks with
#     ``get_chunk()``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# chunksize : int, optional
#     Return TextFileReader object for iteration.
#     See the `IO Tools docs
#     <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
#     for more information on ``iterator`` and ``chunksize``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# compression : str or dict, default 'infer'
#     For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
#     path-like, then detect compression from the following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using
#     'zip', the ZIP file must contain only one data file to be read in. Set to
#     ``None`` for no decompression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
#     example, the following could be passed for Zstandard decompression using a
#     custom compression dictionary:
#     ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
#
#     .. versionchanged:: 1.4.0 Zstandard support.
#
# thousands : str, optional
#     Thousands separator.
# decimal : str, default '.'
#     Character to recognize as decimal point (e.g. use ',' for European data).
# lineterminator : str (length 1), optional
#     Character to break file into lines. Only valid with C parser.
# quotechar : str (length 1), optional
#     The character used to denote the start and end of a quoted item. Quoted
#     items can include the delimiter and it will be ignored.
# quoting : int or csv.QUOTE_* instance, default 0
#     Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
#     QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
# doublequote : bool, default ``True``
#    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
#    whether or not to interpret two consecutive quotechar elements INSIDE a
#    field as a single ``quotechar`` element.
# escapechar : str (length 1), optional
#     One-character string used to escape other characters.
# comment : str, optional
#     Indicates remainder of line should not be parsed. If found at the beginning
#     of a line, the line will be ignored altogether. This parameter must be a
#     single character. Like empty lines (as long as ``skip_blank_lines=True``),
#     fully commented lines are ignored by the parameter `header` but not by
#     `skiprows`. For example, if ``comment='#'``, parsing
#     ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
#     treated as the header.
# encoding : str, optional
#     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
#     standard encodings
#     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
#
#     .. versionchanged:: 1.2
#
#        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
#        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
#        This behavior was previously only the case for ``engine="python"``.
#
#     .. versionchanged:: 1.3.0
#
#        ``encoding_errors`` is a new argument. ``encoding`` has no longer an
#        influence on how encoding errors are handled.
#
# encoding_errors : str, optional, default "strict"
#     How encoding errors are treated. `List of possible values
#     <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
#
#     .. versionadded:: 1.3.0
#
# dialect : str or csv.Dialect, optional
#     If provided, this parameter will override values (default or not) for the
#     following parameters: `delimiter`, `doublequote`, `escapechar`,
#     `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
#     override values, a ParserWarning will be issued. See csv.Dialect
#     documentation for more details.
# error_bad_lines : bool, optional, default ``None``
#     Lines with too many fields (e.g. a csv line with too many commas) will by
#     default cause an exception to be raised, and no DataFrame will be returned.
#     If False, then these "bad lines" will be dropped from the DataFrame that is
#     returned.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# warn_bad_lines : bool, optional, default ``None``
#     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
#     "bad line" will be output.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
#     Specifies what to do upon encountering a bad line (a line with too many fields).
#     Allowed values are :
#
#         - 'error', raise an Exception when a bad line is encountered.
#         - 'warn', raise a warning when a bad line is encountered and skip that line.
#         - 'skip', skip bad lines without raising or warning when they are encountered.
#
#     .. versionadded:: 1.3.0
#
#         - callable, function with signature
#           ``(bad_line: list[str]) -> list[str] | None`` that will process a single
#           bad line. ``bad_line`` is a list of strings split by the ``sep``.
#           If the function returns ``None``, the bad line will be ignored.
#           If the function returns a new list of strings with more elements than
#           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
#           Only supported when ``engine="python"``
#
#     .. versionadded:: 1.4.0
#
# delim_whitespace : bool, default False
#     Specifies whether or not whitespace (e.g. ``' '`` or ``'    '``) will be
#     used as the sep. Equivalent to setting ``sep='\s+'``. If this option
#     is set to True, nothing should be passed in for the ``delimiter``
#     parameter.
# low_memory : bool, default True
#     Internally process the file in chunks, resulting in lower memory use
#     while parsing, but possibly mixed type inference.  To ensure no mixed
#     types either set False, or specify the type with the `dtype` parameter.
#     Note that the entire file is read into a single DataFrame regardless,
#     use the `chunksize` or `iterator` parameter to return the data in chunks.
#     (Only valid with C parser).
# memory_map : bool, default False
#     If a filepath is provided for `filepath_or_buffer`, map the file object
#     directly onto memory and access the data directly from there. Using this
#     option can improve performance because there is no longer any I/O overhead.
# float_precision : str, optional
#     Specifies which converter the C engine should use for floating-point
#     values. The options are ``None`` or 'high' for the ordinary converter,
#     'legacy' for the original lower precision pandas converter, and
#     'round_trip' for the round-trip converter.
#
#     .. versionchanged:: 1.2
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2
#
# Returns
# -------
# DataFrame or TextParser
#     A comma-separated values (csv) file is returned as two-dimensional
#     data structure with labeled axes.
#
# See Also
# --------
# DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_fwf : Read a table of fixed-width formatted lines into DataFrame.
#
# Examples
# --------
# >>> pd.read_csv('data.csv')  # doctest: +SKIP
#
# </code>
# <a href='#5'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#5'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Read the shops table from the competition input directory, then preview
# its first two rows as the cell output.
shop = pd.read_csv(
    filepath_or_buffer='/kaggle/input/competitive-data-science-predict-future-sales/shops.csv',
)
shop.head(n=2)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>6. Data Preparation</h1>  <a id='6'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.io.parsers.readers.read_csv</u></summary>
# <blockquote>
# <code>
# Read a comma-separated values (csv) file into DataFrame.
#
# Also supports optionally iterating or breaking of the file
# into chunks.
#
# Additional help can be found in the online docs for
# `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
#
# Parameters
# ----------
# filepath_or_buffer : str, path object or file-like object
#     Any valid string path is acceptable. The string could be a URL. Valid
#     URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
#     expected. A local file could be: file://localhost/path/to/table.csv.
#
#     If you want to pass in a path object, pandas accepts any ``os.PathLike``.
#
#     By file-like object, we refer to objects with a ``read()`` method, such as
#     a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
# sep : str, default ','
#     Delimiter to use. If sep is None, the C engine cannot automatically detect
#     the separator, but the Python parsing engine can, meaning the latter will
#     be used and automatically detect the separator by Python's builtin sniffer
#     tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
#     different from ``'\s+'`` will be interpreted as regular expressions and
#     will also force the use of the Python parsing engine. Note that regex
#     delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
# delimiter : str, default ``None``
#     Alias for sep.
# header : int, list of int, None, default 'infer'
#     Row number(s) to use as the column names, and the start of the
#     data.  Default behavior is to infer the column names: if no names
#     are passed the behavior is identical to ``header=0`` and column
#     names are inferred from the first line of the file, if column
#     names are passed explicitly then the behavior is identical to
#     ``header=None``. Explicitly pass ``header=0`` to be able to
#     replace existing names. The header can be a list of integers that
#     specify row locations for a multi-index on the columns
#     e.g. [0,1,3]. Intervening rows that are not specified will be
#     skipped (e.g. 2 in this example is skipped). Note that this
#     parameter ignores commented lines and empty lines if
#     ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
#     data rather than the first line of the file.
# names : array-like, optional
#     List of column names to use. If the file contains a header row,
#     then you should explicitly pass ``header=0`` to override the column names.
#     Duplicates in this list are not allowed.
# index_col : int, str, sequence of int / str, or False, optional, default ``None``
#   Column(s) to use as the row labels of the ``DataFrame``, either given as
#   string name or column index. If a sequence of int / str is given, a
#   MultiIndex is used.
#
#   Note: ``index_col=False`` can be used to force pandas to *not* use the first
#   column as the index, e.g. when you have a malformed file with delimiters at
#   the end of each line.
# usecols : list-like or callable, optional
#     Return a subset of the columns. If list-like, all elements must either
#     be positional (i.e. integer indices into the document columns) or strings
#     that correspond to column names provided either by the user in `names` or
#     inferred from the document header row(s). If ``names`` are given, the document
#     header row(s) are not taken into account. For example, a valid list-like
#     `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
#     Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
#     To instantiate a DataFrame from ``data`` with element order preserved use
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
#     in ``['foo', 'bar']`` order or
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
#     for ``['bar', 'foo']`` order.
#
#     If callable, the callable function will be evaluated against the column
#     names, returning names where the callable function evaluates to True. An
#     example of a valid callable argument would be ``lambda x: x.upper() in
#     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
#     parsing time and lower memory usage.
# squeeze : bool, default False
#     If the parsed data only contains one column then return a Series.
#
#     .. deprecated:: 1.4.0
#         Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
#         the data.
# prefix : str, optional
#     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
#
#     .. deprecated:: 1.4.0
#        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
# mangle_dupe_cols : bool, default True
#     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
#     'X'...'X'. Passing in False will cause data to be overwritten if there
#     are duplicate names in the columns.
# dtype : Type name or dict of column -> type, optional
#     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
#     'c': 'Int64'}
#     Use `str` or `object` together with suitable `na_values` settings
#     to preserve and not interpret dtype.
#     If converters are specified, they will be applied INSTEAD
#     of dtype conversion.
# engine : {'c', 'python', 'pyarrow'}, optional
#     Parser engine to use. The C and pyarrow engines are faster, while the python engine
#     is currently more feature-complete. Multithreading is currently only supported by
#     the pyarrow engine.
#
#     .. versionadded:: 1.4.0
#
#         The "pyarrow" engine was added as an *experimental* engine, and some features
#         are unsupported, or may not work correctly, with this engine.
# converters : dict, optional
#     Dict of functions for converting values in certain columns. Keys can either
#     be integers or column labels.
# true_values : list, optional
#     Values to consider as True.
# false_values : list, optional
#     Values to consider as False.
# skipinitialspace : bool, default False
#     Skip spaces after delimiter.
# skiprows : list-like, int or callable, optional
#     Line numbers to skip (0-indexed) or number of lines to skip (int)
#     at the start of the file.
#
#     If callable, the callable function will be evaluated against the row
#     indices, returning True if the row should be skipped and False otherwise.
#     An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
# skipfooter : int, default 0
#     Number of lines at bottom of file to skip (Unsupported with engine='c').
# nrows : int, optional
#     Number of rows of file to read. Useful for reading pieces of large files.
# na_values : scalar, str, list-like, or dict, optional
#     Additional strings to recognize as NA/NaN. If dict passed, specific
#     per-column NA values.  By default the following values are interpreted as
#     NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
#     '1.#IND', '1.#QNAN', '&lt;NA&gt;', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
#     'nan', 'null'.
# keep_default_na : bool, default True
#     Whether or not to include the default NaN values when parsing the data.
#     Depending on whether `na_values` is passed in, the behavior is as follows:
#
#     * If `keep_default_na` is True, and `na_values` are specified, `na_values`
#       is appended to the default NaN values used for parsing.
#     * If `keep_default_na` is True, and `na_values` are not specified, only
#       the default NaN values are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are specified, only
#       the NaN values specified in `na_values` are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are not specified, no
#       strings will be parsed as NaN.
#
#     Note that if `na_filter` is passed in as False, the `keep_default_na` and
#     `na_values` parameters will be ignored.
# na_filter : bool, default True
#     Detect missing value markers (empty strings and the value of na_values). In
#     data without any NAs, passing na_filter=False can improve the performance
#     of reading a large file.
# verbose : bool, default False
#     Indicate number of NA values placed in non-numeric columns.
# skip_blank_lines : bool, default True
#     If True, skip over blank lines rather than interpreting as NaN values.
# parse_dates : bool or list of int or names or list of lists or dict, default False
#     The behavior is as follows:
#
#     * boolean. If True -> try parsing the index.
#     * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
#       each as a separate date column.
#     * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
#       a single date column.
#     * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
#       result 'foo'
#
#     If a column or index cannot be represented as an array of datetimes,
#     say because of an unparsable value or a mixture of timezones, the column
#     or index will be returned unaltered as an object data type. For
#     non-standard datetime parsing, use ``pd.to_datetime`` after
#     ``pd.read_csv``. To parse an index or column with a mixture of timezones,
#     specify ``date_parser`` to be a partially-applied
#     :func:`pandas.to_datetime` with ``utc=True``. See
#     :ref:`io.csv.mixed_timezones` for more.
#
#     Note: A fast-path exists for iso8601-formatted dates.
# infer_datetime_format : bool, default False
#     If True and `parse_dates` is enabled, pandas will attempt to infer the
#     format of the datetime strings in the columns, and if it can be inferred,
#     switch to a faster method of parsing them. In some cases this can increase
#     the parsing speed by 5-10x.
# keep_date_col : bool, default False
#     If True and `parse_dates` specifies combining multiple columns then
#     keep the original columns.
# date_parser : function, optional
#     Function to use for converting a sequence of string columns to an array of
#     datetime instances. The default uses ``dateutil.parser.parser`` to do the
#     conversion. Pandas will try to call `date_parser` in three different ways,
#     advancing to the next if an exception occurs: 1) Pass one or more arrays
#     (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
#     string values from the columns defined by `parse_dates` into a single array
#     and pass that; and 3) call `date_parser` once for each row using one or
#     more strings (corresponding to the columns defined by `parse_dates`) as
#     arguments.
# dayfirst : bool, default False
#     DD/MM format dates, international and European format.
# cache_dates : bool, default True
#     If True, use a cache of unique, converted dates to apply the datetime
#     conversion. May produce significant speed-up when parsing duplicate
#     date strings, especially ones with timezone offsets.
#
#     .. versionadded:: 0.25.0
# iterator : bool, default False
#     Return TextFileReader object for iteration or getting chunks with
#     ``get_chunk()``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# chunksize : int, optional
#     Return TextFileReader object for iteration.
#     See the `IO Tools docs
#     <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
#     for more information on ``iterator`` and ``chunksize``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# compression : str or dict, default 'infer'
#     For on-the-fly decompression of on-disk data. If 'infer' and `filepath_or_buffer` is
#     path-like, then detect compression from the following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using
#     'zip', the ZIP file must contain only one data file to be read in. Set to
#     ``None`` for no decompression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
#     example, the following could be passed for Zstandard decompression using a
#     custom compression dictionary:
#     ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
#
#     .. versionchanged:: 1.4.0 Zstandard support.
#
# thousands : str, optional
#     Thousands separator.
# decimal : str, default '.'
#     Character to recognize as decimal point (e.g. use ',' for European data).
# lineterminator : str (length 1), optional
#     Character to break file into lines. Only valid with C parser.
# quotechar : str (length 1), optional
#     The character used to denote the start and end of a quoted item. Quoted
#     items can include the delimiter and it will be ignored.
# quoting : int or csv.QUOTE_* instance, default 0
#     Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
#     QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
# doublequote : bool, default ``True``
#    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
#    whether or not to interpret two consecutive quotechar elements INSIDE a
#    field as a single ``quotechar`` element.
# escapechar : str (length 1), optional
#     One-character string used to escape other characters.
# comment : str, optional
#     Indicates remainder of line should not be parsed. If found at the beginning
#     of a line, the line will be ignored altogether. This parameter must be a
#     single character. Like empty lines (as long as ``skip_blank_lines=True``),
#     fully commented lines are ignored by the parameter `header` but not by
#     `skiprows`. For example, if ``comment='#'``, parsing
#     ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
#     treated as the header.
# encoding : str, optional
#     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
#     standard encodings
#     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
#
#     .. versionchanged:: 1.2
#
#        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
#        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
#        This behavior was previously only the case for ``engine="python"``.
#
#     .. versionchanged:: 1.3.0
#
#        ``encoding_errors`` is a new argument. ``encoding`` has no longer an
#        influence on how encoding errors are handled.
#
# encoding_errors : str, optional, default "strict"
#     How encoding errors are treated. `List of possible values
#     <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
#
#     .. versionadded:: 1.3.0
#
# dialect : str or csv.Dialect, optional
#     If provided, this parameter will override values (default or not) for the
#     following parameters: `delimiter`, `doublequote`, `escapechar`,
#     `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
#     override values, a ParserWarning will be issued. See csv.Dialect
#     documentation for more details.
# error_bad_lines : bool, optional, default ``None``
#     Lines with too many fields (e.g. a csv line with too many commas) will by
#     default cause an exception to be raised, and no DataFrame will be returned.
#     If False, then these "bad lines" will be dropped from the DataFrame that is
#     returned.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# warn_bad_lines : bool, optional, default ``None``
#     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
#     "bad line" will be output.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
#     Specifies what to do upon encountering a bad line (a line with too many fields).
#     Allowed values are :
#
#         - 'error', raise an Exception when a bad line is encountered.
#         - 'warn', raise a warning when a bad line is encountered and skip that line.
#         - 'skip', skip bad lines without raising or warning when they are encountered.
#
#     .. versionadded:: 1.3.0
#
#         - callable, function with signature
#           ``(bad_line: list[str]) -> list[str] | None`` that will process a single
#           bad line. ``bad_line`` is a list of strings split by the ``sep``.
#           If the function returns ``None``, the bad line will be ignored.
#           If the function returns a new list of strings with more elements than
#           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
#           Only supported when ``engine="python"``
#
#     .. versionadded:: 1.4.0
#
# delim_whitespace : bool, default False
#     Specifies whether or not whitespace (e.g. ``' '`` or ``'    '``) will be
#     used as the sep. Equivalent to setting ``sep='\s+'``. If this option
#     is set to True, nothing should be passed in for the ``delimiter``
#     parameter.
# low_memory : bool, default True
#     Internally process the file in chunks, resulting in lower memory use
#     while parsing, but possibly mixed type inference.  To ensure no mixed
#     types either set False, or specify the type with the `dtype` parameter.
#     Note that the entire file is read into a single DataFrame regardless,
#     use the `chunksize` or `iterator` parameter to return the data in chunks.
#     (Only valid with C parser).
# memory_map : bool, default False
#     If a filepath is provided for `filepath_or_buffer`, map the file object
#     directly onto memory and access the data directly from there. Using this
#     option can improve performance because there is no longer any I/O overhead.
# float_precision : str, optional
#     Specifies which converter the C engine should use for floating-point
#     values. The options are ``None`` or 'high' for the ordinary converter,
#     'legacy' for the original lower precision pandas converter, and
#     'round_trip' for the round-trip converter.
#
#     .. versionchanged:: 1.2
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2
#
# Returns
# -------
# DataFrame or TextParser
#     A comma-separated values (csv) file is returned as two-dimensional
#     data structure with labeled axes.
#
# See Also
# --------
# DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_fwf : Read a table of fixed-width formatted lines into DataFrame.
#
# Examples
# --------
# >>> pd.read_csv('data.csv')  # doctest: +SKIP
#
# </code>
# <a href='#6'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#6'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Read the test set from the competition input directory, then preview
# its first two rows as the cell output.
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/competitive-data-science-predict-future-sales/test.csv',
)
test.head(n=2)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>7. Data Preparation</h1>  <a id='7'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#7'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Preview the leading rows of `train`; n=5 is head()'s default, spelled out
# here so the row count is explicit.
train.head(n=5)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>8. Data Preparation</h1>  <a id='8'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Show the shape of `test` (the frame read from test.csv in an earlier cell)
# as the cell output.
test.shape


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>9. Data Preparation</h1>  <a id='9'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.io.parsers.readers.read_csv</u></summary>
# <blockquote>
# <code>
# Read a comma-separated values (csv) file into DataFrame.
#
# Also supports optionally iterating or breaking of the file
# into chunks.
#
# Additional help can be found in the online docs for
# `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
#
# Parameters
# ----------
# filepath_or_buffer : str, path object or file-like object
#     Any valid string path is acceptable. The string could be a URL. Valid
#     URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
#     expected. A local file could be: file://localhost/path/to/table.csv.
#
#     If you want to pass in a path object, pandas accepts any ``os.PathLike``.
#
#     By file-like object, we refer to objects with a ``read()`` method, such as
#     a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
# sep : str, default ','
#     Delimiter to use. If sep is None, the C engine cannot automatically detect
#     the separator, but the Python parsing engine can, meaning the latter will
#     be used and automatically detect the separator by Python's builtin sniffer
#     tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
#     different from ``'\s+'`` will be interpreted as regular expressions and
#     will also force the use of the Python parsing engine. Note that regex
#     delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
# delimiter : str, default ``None``
#     Alias for sep.
# header : int, list of int, None, default 'infer'
#     Row number(s) to use as the column names, and the start of the
#     data.  Default behavior is to infer the column names: if no names
#     are passed the behavior is identical to ``header=0`` and column
#     names are inferred from the first line of the file, if column
#     names are passed explicitly then the behavior is identical to
#     ``header=None``. Explicitly pass ``header=0`` to be able to
#     replace existing names. The header can be a list of integers that
#     specify row locations for a multi-index on the columns
#     e.g. [0,1,3]. Intervening rows that are not specified will be
#     skipped (e.g. 2 in this example is skipped). Note that this
#     parameter ignores commented lines and empty lines if
#     ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
#     data rather than the first line of the file.
# names : array-like, optional
#     List of column names to use. If the file contains a header row,
#     then you should explicitly pass ``header=0`` to override the column names.
#     Duplicates in this list are not allowed.
# index_col : int, str, sequence of int / str, or False, optional, default ``None``
#   Column(s) to use as the row labels of the ``DataFrame``, either given as
#   string name or column index. If a sequence of int / str is given, a
#   MultiIndex is used.
#
#   Note: ``index_col=False`` can be used to force pandas to *not* use the first
#   column as the index, e.g. when you have a malformed file with delimiters at
#   the end of each line.
# usecols : list-like or callable, optional
#     Return a subset of the columns. If list-like, all elements must either
#     be positional (i.e. integer indices into the document columns) or strings
#     that correspond to column names provided either by the user in `names` or
#     inferred from the document header row(s). If ``names`` are given, the document
#     header row(s) are not taken into account. For example, a valid list-like
#     `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
#     Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
#     To instantiate a DataFrame from ``data`` with element order preserved use
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
#     in ``['foo', 'bar']`` order or
#     ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
#     for ``['bar', 'foo']`` order.
#
#     If callable, the callable function will be evaluated against the column
#     names, returning names where the callable function evaluates to True. An
#     example of a valid callable argument would be ``lambda x: x.upper() in
#     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
#     parsing time and lower memory usage.
# squeeze : bool, default False
#     If the parsed data only contains one column then return a Series.
#
#     .. deprecated:: 1.4.0
#         Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
#         the data.
# prefix : str, optional
#     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
#
#     .. deprecated:: 1.4.0
#        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
# mangle_dupe_cols : bool, default True
#     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
#     'X'...'X'. Passing in False will cause data to be overwritten if there
#     are duplicate names in the columns.
# dtype : Type name or dict of column -> type, optional
#     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
#     'c': 'Int64'}
#     Use `str` or `object` together with suitable `na_values` settings
#     to preserve and not interpret dtype.
#     If converters are specified, they will be applied INSTEAD
#     of dtype conversion.
# engine : {'c', 'python', 'pyarrow'}, optional
#     Parser engine to use. The C and pyarrow engines are faster, while the python engine
#     is currently more feature-complete. Multithreading is currently only supported by
#     the pyarrow engine.
#
#     .. versionadded:: 1.4.0
#
#         The "pyarrow" engine was added as an *experimental* engine, and some features
#         are unsupported, or may not work correctly, with this engine.
# converters : dict, optional
#     Dict of functions for converting values in certain columns. Keys can either
#     be integers or column labels.
# true_values : list, optional
#     Values to consider as True.
# false_values : list, optional
#     Values to consider as False.
# skipinitialspace : bool, default False
#     Skip spaces after delimiter.
# skiprows : list-like, int or callable, optional
#     Line numbers to skip (0-indexed) or number of lines to skip (int)
#     at the start of the file.
#
#     If callable, the callable function will be evaluated against the row
#     indices, returning True if the row should be skipped and False otherwise.
#     An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
# skipfooter : int, default 0
#     Number of lines at bottom of file to skip (Unsupported with engine='c').
# nrows : int, optional
#     Number of rows of file to read. Useful for reading pieces of large files.
# na_values : scalar, str, list-like, or dict, optional
#     Additional strings to recognize as NA/NaN. If dict passed, specific
#     per-column NA values.  By default the following values are interpreted as
#     NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
#     '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
#     'nan', 'null'.
# keep_default_na : bool, default True
#     Whether or not to include the default NaN values when parsing the data.
#     Depending on whether `na_values` is passed in, the behavior is as follows:
#
#     * If `keep_default_na` is True, and `na_values` are specified, `na_values`
#       is appended to the default NaN values used for parsing.
#     * If `keep_default_na` is True, and `na_values` are not specified, only
#       the default NaN values are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are specified, only
#       the NaN values specified `na_values` are used for parsing.
#     * If `keep_default_na` is False, and `na_values` are not specified, no
#       strings will be parsed as NaN.
#
#     Note that if `na_filter` is passed in as False, the `keep_default_na` and
#     `na_values` parameters will be ignored.
# na_filter : bool, default True
#     Detect missing value markers (empty strings and the value of na_values). In
#     data without any NAs, passing na_filter=False can improve the performance
#     of reading a large file.
# verbose : bool, default False
#     Indicate number of NA values placed in non-numeric columns.
# skip_blank_lines : bool, default True
#     If True, skip over blank lines rather than interpreting as NaN values.
# parse_dates : bool or list of int or names or list of lists or dict, default False
#     The behavior is as follows:
#
#     * boolean. If True -> try parsing the index.
#     * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
#       each as a separate date column.
#     * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
#       a single date column.
#     * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
#       result 'foo'
#
#     If a column or index cannot be represented as an array of datetimes,
#     say because of an unparsable value or a mixture of timezones, the column
#     or index will be returned unaltered as an object data type. For
#     non-standard datetime parsing, use ``pd.to_datetime`` after
#     ``pd.read_csv``. To parse an index or column with a mixture of timezones,
#     specify ``date_parser`` to be a partially-applied
#     :func:`pandas.to_datetime` with ``utc=True``. See
#     :ref:`io.csv.mixed_timezones` for more.
#
#     Note: A fast-path exists for iso8601-formatted dates.
# infer_datetime_format : bool, default False
#     If True and `parse_dates` is enabled, pandas will attempt to infer the
#     format of the datetime strings in the columns, and if it can be inferred,
#     switch to a faster method of parsing them. In some cases this can increase
#     the parsing speed by 5-10x.
# keep_date_col : bool, default False
#     If True and `parse_dates` specifies combining multiple columns then
#     keep the original columns.
# date_parser : function, optional
#     Function to use for converting a sequence of string columns to an array of
#     datetime instances. The default uses ``dateutil.parser.parser`` to do the
#     conversion. Pandas will try to call `date_parser` in three different ways,
#     advancing to the next if an exception occurs: 1) Pass one or more arrays
#     (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
#     string values from the columns defined by `parse_dates` into a single array
#     and pass that; and 3) call `date_parser` once for each row using one or
#     more strings (corresponding to the columns defined by `parse_dates`) as
#     arguments.
# dayfirst : bool, default False
#     DD/MM format dates, international and European format.
# cache_dates : bool, default True
#     If True, use a cache of unique, converted dates to apply the datetime
#     conversion. May produce significant speed-up when parsing duplicate
#     date strings, especially ones with timezone offsets.
#
#     .. versionadded:: 0.25.0
# iterator : bool, default False
#     Return TextFileReader object for iteration or getting chunks with
#     ``get_chunk()``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# chunksize : int, optional
#     Return TextFileReader object for iteration.
#     See the `IO Tools docs
#     <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
#     for more information on ``iterator`` and ``chunksize``.
#
#     .. versionchanged:: 1.2
#
#        ``TextFileReader`` is a context manager.
# compression : str or dict, default 'infer'
#     For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
#     path-like, then detect compression from the following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using
#     'zip', the ZIP file must contain only one data file to be read in. Set to
#     ``None`` for no decompression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
#     example, the following could be passed for Zstandard decompression using a
#     custom compression dictionary:
#     ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
#
#     .. versionchanged:: 1.4.0 Zstandard support.
#
# thousands : str, optional
#     Thousands separator.
# decimal : str, default '.'
#     Character to recognize as decimal point (e.g. use ',' for European data).
# lineterminator : str (length 1), optional
#     Character to break file into lines. Only valid with C parser.
# quotechar : str (length 1), optional
#     The character used to denote the start and end of a quoted item. Quoted
#     items can include the delimiter and it will be ignored.
# quoting : int or csv.QUOTE_* instance, default 0
#     Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
#     QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
# doublequote : bool, default ``True``
#    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
#    whether or not to interpret two consecutive quotechar elements INSIDE a
#    field as a single ``quotechar`` element.
# escapechar : str (length 1), optional
#     One-character string used to escape other characters.
# comment : str, optional
#     Indicates remainder of line should not be parsed. If found at the beginning
#     of a line, the line will be ignored altogether. This parameter must be a
#     single character. Like empty lines (as long as ``skip_blank_lines=True``),
#     fully commented lines are ignored by the parameter `header` but not by
#     `skiprows`. For example, if ``comment='#'``, parsing
#     ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
#     treated as the header.
# encoding : str, optional
#     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
#     standard encodings
#     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
#
#     .. versionchanged:: 1.2
#
#        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
#        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
#        This behavior was previously only the case for ``engine="python"``.
#
#     .. versionchanged:: 1.3.0
#
#        ``encoding_errors`` is a new argument. ``encoding`` no longer has an
#        influence on how encoding errors are handled.
#
# encoding_errors : str, optional, default "strict"
#     How encoding errors are treated. `List of possible values
#     <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
#
#     .. versionadded:: 1.3.0
#
# dialect : str or csv.Dialect, optional
#     If provided, this parameter will override values (default or not) for the
#     following parameters: `delimiter`, `doublequote`, `escapechar`,
#     `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
#     override values, a ParserWarning will be issued. See csv.Dialect
#     documentation for more details.
# error_bad_lines : bool, optional, default ``None``
#     Lines with too many fields (e.g. a csv line with too many commas) will by
#     default cause an exception to be raised, and no DataFrame will be returned.
#     If False, then these "bad lines" will be dropped from the DataFrame that is
#     returned.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# warn_bad_lines : bool, optional, default ``None``
#     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
#     "bad line" will be output.
#
#     .. deprecated:: 1.3.0
#        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
#        encountering a bad line.
# on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
#     Specifies what to do upon encountering a bad line (a line with too many fields).
#     Allowed values are :
#
#         - 'error', raise an Exception when a bad line is encountered.
#         - 'warn', raise a warning when a bad line is encountered and skip that line.
#         - 'skip', skip bad lines without raising or warning when they are encountered.
#
#     .. versionadded:: 1.3.0
#
#         - callable, function with signature
#           ``(bad_line: list[str]) -> list[str] | None`` that will process a single
#           bad line. ``bad_line`` is a list of strings split by the ``sep``.
#           If the function returns ``None``, the bad line will be ignored.
#           If the function returns a new list of strings with more elements than
#           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
#           Only supported when ``engine="python"``
#
#     .. versionadded:: 1.4.0
#
# delim_whitespace : bool, default False
#     Specifies whether or not whitespace (e.g. ``' '`` or ``'    '``) will be
#     used as the sep. Equivalent to setting ``sep='\s+'``. If this option
#     is set to True, nothing should be passed in for the ``delimiter``
#     parameter.
# low_memory : bool, default True
#     Internally process the file in chunks, resulting in lower memory use
#     while parsing, but possibly mixed type inference.  To ensure no mixed
#     types either set False, or specify the type with the `dtype` parameter.
#     Note that the entire file is read into a single DataFrame regardless,
#     use the `chunksize` or `iterator` parameter to return the data in chunks.
#     (Only valid with C parser).
# memory_map : bool, default False
#     If a filepath is provided for `filepath_or_buffer`, map the file object
#     directly onto memory and access the data directly from there. Using this
#     option can improve performance because there is no longer any I/O overhead.
# float_precision : str, optional
#     Specifies which converter the C engine should use for floating-point
#     values. The options are ``None`` or 'high' for the ordinary converter,
#     'legacy' for the original lower precision pandas converter, and
#     'round_trip' for the round-trip converter.
#
#     .. versionchanged:: 1.2
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2
#
# Returns
# -------
# DataFrame or TextParser
#     A comma-separated values (csv) file is returned as two-dimensional
#     data structure with labeled axes.
#
# See Also
# --------
# DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_fwf : Read a table of fixed-width formatted lines into DataFrame.
#
# Examples
# --------
# >>> pd.read_csv('data.csv')  # doctest: +SKIP
#
# </code>
# <a href='#9'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#9'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Read the competition's sample submission CSV into a DataFrame, then show
# its first two rows as the cell output.
submission = pd.read_csv(
    filepath_or_buffer='/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv'
)
submission.head(n=2)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>10. Data Preparation</h1>  <a id='10'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Display the (rows, columns) dimensions of the sample submission frame.
submission.shape


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>11. Data Preparation</h1>  <a id='11'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Display the dimensions of `train` (defined in an earlier cell) before it is
# filtered against the test set in the following cell.
train.shape


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>12. Data Preparation</h1>  <a id='12'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.isin</u></summary>
# <blockquote>
# <code>
# Whether elements in Series are contained in `values`.
#
# Return a boolean Series showing whether each element in the Series
# matches an element in the passed sequence of `values` exactly.
#
# Parameters
# ----------
# values : set or list-like
#     The sequence of values to test. Passing in a single string will
#     raise a ``TypeError``. Instead, turn a single string into a
#     list of one element.
#
# Returns
# -------
# Series
#     Series of booleans indicating if each element is in values.
#
# Raises
# ------
# TypeError
#   * If `values` is a string
#
# See Also
# --------
# DataFrame.isin : Equivalent method on DataFrame.
#
# Examples
# --------
# >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
# ...                'hippo'], name='animal')
# >>> s.isin(['cow', 'lama'])
# 0     True
# 1     True
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# To invert the boolean values, use the ``~`` operator:
#
# >>> ~s.isin(['cow', 'lama'])
# 0    False
# 1    False
# 2    False
# 3     True
# 4    False
# 5     True
# Name: animal, dtype: bool
#
# Passing a single string as ``s.isin('lama')`` will raise an error. Use
# a list of one element instead:
#
# >>> s.isin(['lama'])
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# Strings and integers are distinct and are therefore not comparable:
#
# >>> pd.Series([1]).isin(['1'])
# 0    False
# dtype: bool
# >>> pd.Series([1.1]).isin(['1.1'])
# 0    False
# dtype: bool
#
# </code>
# <a href='#12'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Restrict the training rows to those whose item_id and shop_id both also
# occur in the test set; a row is kept only if it matches on both columns.
train = train.loc[
    train['item_id'].isin(test['item_id'])
    & train['shop_id'].isin(test['shop_id'])
]


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>13. Data Preparation</h1>  <a id='13'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.info</u></summary>
# <blockquote>
# <code>
# Print a concise summary of a Series.
#
# This method prints information about a Series including
# the index dtype, non-null values and memory usage.
#
# .. versionadded:: 1.4.0
#
# Parameters
# ----------
# data : Series
#     Series to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.    
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the Series
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always show memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
#
# Returns
# -------
# None
#     This method prints a summary of a Series and returns None.
#
# See Also
# --------
# Series.describe: Generate descriptive statistics of Series.
# Series.memory_usage: Memory usage of Series.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> s = pd.Series(text_values, index=int_values)
# >>> s.info()
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# Series name: None
# Non-Null Count  Dtype
# --------------  -----
# 5 non-null      object
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Prints a summary excluding information about its values:
#
# >>> s.info(verbose=False)
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Pipe output of Series.info to buffer instead of sys.stdout, get
# buffer content and writes to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> s.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, specially
# useful for big Series and fine-tune memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
# >>> s.info()
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 7.6+ MB
#
# >>> s.info(memory_usage='deep')
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 55.3 MB
#
# </code>
# <a href='#13'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.info</u></summary>
# <blockquote>
# <code>
# Print a concise summary of a DataFrame.
#
# This method prints information about a DataFrame including
# the index dtype and columns, non-null values and memory usage.
#
# Parameters
# ----------
# data : DataFrame
#     DataFrame to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# max_cols : int, optional
#     When to switch from the verbose to the truncated output. If the
#     DataFrame has more than `max_cols` columns, the truncated output
#     is used. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is used.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the DataFrame
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always show memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based on column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
# null_counts : bool, optional
#     .. deprecated:: 1.2.0
#         Use show_counts instead.
#
# Returns
# -------
# None
#     This method prints a summary of a DataFrame and returns None.
#
# See Also
# --------
# DataFrame.describe: Generate descriptive statistics of DataFrame
#     columns.
# DataFrame.memory_usage: Memory usage of DataFrame columns.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
# >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
# ...                   "float_col": float_values})
# >>> df
#     int_col text_col  float_col
# 0        1    alpha       0.00
# 1        2     beta       0.25
# 2        3    gamma       0.50
# 3        4    delta       0.75
# 4        5  epsilon       1.00
#
# Prints information of all columns:
#
# >>> df.info(verbose=True)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Data columns (total 3 columns):
#  #   Column     Non-Null Count  Dtype
# ---  ------     --------------  -----
#  0   int_col    5 non-null      int64
#  1   text_col   5 non-null      object
#  2   float_col  5 non-null      float64
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Prints a summary of columns count and its dtypes but not per column
# information:
#
# >>> df.info(verbose=False)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Columns: 3 entries, int_col to float_col
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Pipe output of DataFrame.info to buffer instead of sys.stdout, get
# buffer content and writes to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> df.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, specially
# useful for big DataFrames and fine-tune memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> df = pd.DataFrame({
# ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
# ... })
# >>> df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 22.9+ MB
#
# >>> df.info(memory_usage='deep')
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 165.9 MB
#
# </code>
# <a href='#13'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Print a summary of the filtered `train` frame: index, columns, non-null
# counts, dtypes and memory usage.
train.info()


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>14. Data Preparation</h1>  <a id='14'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#14'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Preview the first rows of `train` (head() defaults to 5 rows).
train.head()


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>15. Data Preparation | Feature Engineering</h1>  <a id='15'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u></summary>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#15'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u></summary>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#15'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Remove the 'date' column from `train`, modifying the frame in place.
train.drop(columns=['date'], inplace=True)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>16. Data Preparation</h1>  <a id='16'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#16'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Show the first five rows of `test` (five is head()'s documented default).
test.head(n=5)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>17. Data Preparation</h1>  <a id='17'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Display the 'date_block_num' column of `train`.
train.date_block_num


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>18. Data Preparation</h1>  <a id='18'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#18'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Stamp every row of `test` with date_block_num = 34.
# NOTE(review): 34 is presumably the index of the period being predicted
# (one past the last block present in `train`) — confirm against the data.
test['date_block_num'] = 34

# Keep only these three columns, in this order. The explicit .copy() makes
# `test` an independent frame, so columns assigned to it in later cells are
# not treated by pandas as writes to a slice of another frame
# (SettingWithCopyWarning on pandas versions without Copy-on-Write).
test = test[['date_block_num', 'shop_id', 'item_id']].copy()

# Quick look at the first two rows of the reshaped frame.
test.head(2)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>19. Data Preparation | Feature Engineering</h1>  <a id='19'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.groupby</u></summary>
# <blockquote>
# <code>
# Group DataFrame using a mapper or by a Series of columns.
#
# A groupby operation involves some combination of splitting the
# object, applying a function, and combining the results. This can be
# used to group large amounts of data and compute operations on these
# groups.
#
# Parameters
# ----------
# by : mapping, function, label, or list of labels
#     Used to determine the groups for the groupby.
#     If ``by`` is a function, it's called on each value of the object's
#     index. If a dict or Series is passed, the Series or dict VALUES
#     will be used to determine the groups (the Series' values are first
#     aligned; see ``.align()`` method). If a list or ndarray of length
#     equal to the selected axis is passed (see the `groupby user guide
#     <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
#     the values are used as-is to determine the groups. A label or list
#     of labels may be passed to group by the columns in ``self``.
#     Notice that a tuple is interpreted as a (single) key.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Split along rows (0) or columns (1).
# level : int, level name, or sequence of such, default None
#     If the axis is a MultiIndex (hierarchical), group by a particular
#     level or levels.
# as_index : bool, default True
#     For aggregated output, return object with group labels as the
#     index. Only relevant for DataFrame input. as_index=False is
#     effectively "SQL-style" grouped output.
# sort : bool, default True
#     Sort group keys. Get better performance by turning this off.
#     Note this does not influence the order of observations within each
#     group. Groupby preserves the order of rows within each group.
# group_keys : bool, default True
#     When calling apply, add group keys to index to identify pieces.
# squeeze : bool, default False
#     Reduce the dimensionality of the return type if possible,
#     otherwise return a consistent type.
#
#     .. deprecated:: 1.1.0
#
# observed : bool, default False
#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.
# dropna : bool, default True
#     If True, and if group keys contain NA values, NA values together
#     with row/column will be dropped.
#     If False, NA values will also be treated as the key in groups.
#
#     .. versionadded:: 1.1.0
#
# Returns
# -------
# DataFrameGroupBy
#     Returns a groupby object that contains information about the groups.
#
# See Also
# --------
# resample : Convenience method for frequency conversion and resampling
#     of time series.
#
# Notes
# -----
# See the `user guide
# <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
# detailed usage and examples, including splitting an object into groups,
# iterating through groups, selecting a group, aggregation, and more.
#
# Examples
# --------
# >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
# ...                               'Parrot', 'Parrot'],
# ...                    'Max Speed': [380., 370., 24., 26.]})
# >>> df
#    Animal  Max Speed
# 0  Falcon      380.0
# 1  Falcon      370.0
# 2  Parrot       24.0
# 3  Parrot       26.0
# >>> df.groupby(['Animal']).mean()
#         Max Speed
# Animal
# Falcon      375.0
# Parrot       25.0
#
# **Hierarchical Indexes**
#
# We can groupby different levels of a hierarchical index
# using the `level` parameter:
#
# >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
# ...           ['Captive', 'Wild', 'Captive', 'Wild']]
# >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
# >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
# ...                   index=index)
# >>> df
#                 Max Speed
# Animal Type
# Falcon Captive      390.0
#        Wild         350.0
# Parrot Captive       30.0
#        Wild          20.0
# >>> df.groupby(level=0).mean()
#         Max Speed
# Animal
# Falcon      370.0
# Parrot       25.0
# >>> df.groupby(level="Type").mean()
#          Max Speed
# Type
# Captive      210.0
# Wild         185.0
#
# We can also choose to include NA in group keys or not by setting
# `dropna` parameter, the default setting is `True`.
#
# >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by=["b"]).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
#
# >>> df.groupby(by=["b"], dropna=False).sum()
#     a   c
# b
# 1.0 2   3
# 2.0 2   5
# NaN 1   4
#
# >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
# >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
#
# >>> df.groupby(by="a").sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
#
# >>> df.groupby(by="a", dropna=False).sum()
#     b     c
# a
# a   13.0   13.0
# b   12.3  123.0
# NaN 12.3   33.0
#
# </code>
# <a href='#19'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.groupby</u></summary>
# <blockquote>
# <code>
# Group Series using a mapper or by a Series of columns.
#
# A groupby operation involves some combination of splitting the
# object, applying a function, and combining the results. This can be
# used to group large amounts of data and compute operations on these
# groups.
#
# Parameters
# ----------
# by : mapping, function, label, or list of labels
#     Used to determine the groups for the groupby.
#     If ``by`` is a function, it's called on each value of the object's
#     index. If a dict or Series is passed, the Series or dict VALUES
#     will be used to determine the groups (the Series' values are first
#     aligned; see ``.align()`` method). If a list or ndarray of length
#     equal to the selected axis is passed (see the `groupby user guide
#     <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
#     the values are used as-is to determine the groups. A label or list
#     of labels may be passed to group by the columns in ``self``.
#     Notice that a tuple is interpreted as a (single) key.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Split along rows (0) or columns (1).
# level : int, level name, or sequence of such, default None
#     If the axis is a MultiIndex (hierarchical), group by a particular
#     level or levels.
# as_index : bool, default True
#     For aggregated output, return object with group labels as the
#     index. Only relevant for DataFrame input. as_index=False is
#     effectively "SQL-style" grouped output.
# sort : bool, default True
#     Sort group keys. Get better performance by turning this off.
#     Note this does not influence the order of observations within each
#     group. Groupby preserves the order of rows within each group.
# group_keys : bool, default True
#     When calling apply, add group keys to index to identify pieces.
# squeeze : bool, default False
#     Reduce the dimensionality of the return type if possible,
#     otherwise return a consistent type.
#
#     .. deprecated:: 1.1.0
#
# observed : bool, default False
#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.
# dropna : bool, default True
#     If True, and if group keys contain NA values, NA values together
#     with row/column will be dropped.
#     If False, NA values will also be treated as the key in groups.
#
#     .. versionadded:: 1.1.0
#
# Returns
# -------
# SeriesGroupBy
#     Returns a groupby object that contains information about the groups.
#
# See Also
# --------
# resample : Convenience method for frequency conversion and resampling
#     of time series.
#
# Notes
# -----
# See the `user guide
# <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
# detailed usage and examples, including splitting an object into groups,
# iterating through groups, selecting a group, aggregation, and more.
#
# Examples
# --------
# >>> ser = pd.Series([390., 350., 30., 20.],
# ...                 index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed")
# >>> ser
# Falcon    390.0
# Falcon    350.0
# Parrot     30.0
# Parrot     20.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(["a", "b", "a", "b"]).mean()
# a    210.0
# b    185.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(level=0).mean()
# Falcon    370.0
# Parrot     25.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(ser > 100).mean()
# Max Speed
# False     25.0
# True     370.0
# Name: Max Speed, dtype: float64
#
# **Grouping by Indexes**
#
# We can groupby different levels of a hierarchical index
# using the `level` parameter:
#
# >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
# ...           ['Captive', 'Wild', 'Captive', 'Wild']]
# >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
# >>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed")
# >>> ser
# Animal  Type
# Falcon  Captive    390.0
#         Wild       350.0
# Parrot  Captive     30.0
#         Wild        20.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(level=0).mean()
# Animal
# Falcon    370.0
# Parrot     25.0
# Name: Max Speed, dtype: float64
# >>> ser.groupby(level="Type").mean()
# Type
# Captive    210.0
# Wild       185.0
# Name: Max Speed, dtype: float64
#
# We can also choose to include `NA` in group keys or not by defining
# `dropna` parameter, the default setting is `True`.
#
# >>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
# >>> ser.groupby(level=0).sum()
# a    3
# b    3
# dtype: int64
#
# >>> ser.groupby(level=0, dropna=False).sum()
# a    3
# b    3
# NaN  3
# dtype: int64
#
# >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
# >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
# >>> ser.groupby(["a", "b", "a", np.nan]).mean()
# a    210.0
# b    350.0
# Name: Max Speed, dtype: float64
#
# >>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
# a    210.0
# b    350.0
# NaN   20.0
# Name: Max Speed, dtype: float64
#
# </code>
# <a href='#19'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Build an item_id -> price lookup dict from the last non-null 'item_price'
# recorded for each item_id in `train`.
# NOTE(review): "last" follows the current row order of `train`; it is the
# most recent price only if `train` is sorted chronologically — confirm.
# Series.to_dict() keeps the item_id group keys as they are. The previous
# dict(...reset_index().values) round-trip packed ids and prices into one
# 2-D array of a single dtype, so integer ids could end up stored as floats.
item_price = train.groupby('item_id')['item_price'].last().to_dict()


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>20. Data Preparation | Feature Engineering</h1>  <a id='20'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u></summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#20'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#20'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Look up each row's item_id in the `item_price` dict; ids that are not keys
# of the dict come back as NaN.
test['item_price'] = test['item_id'].map(item_price)

# Preview the first five rows with the new column attached.
test.head(5)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>21. Data Preparation | Feature Engineering</h1>  <a id='21'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.isnull</u></summary>
# <blockquote>
# <code>
# DataFrame.isnull is an alias for DataFrame.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# DataFrame
#     Mask of bool values for each element in DataFrame that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# DataFrame.isnull : Alias of isna.
# DataFrame.notna : Boolean inverse of isna.
# DataFrame.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#21'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.<locals>.sum</u></summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#21'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Count the missing (NA) entries in each column of `test`.
test.isna().sum()


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>22. Data Preparation</h1>  <a id='22'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Display the (rows, columns) shape of both frames as a pair.
(train.shape, test.shape)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>23. Data Preparation</h1>  <a id='23'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.isin</u></summary>
# <blockquote>
# <code>
# Whether elements in Series are contained in `values`.
#
# Return a boolean Series showing whether each element in the Series
# matches an element in the passed sequence of `values` exactly.
#
# Parameters
# ----------
# values : set or list-like
#     The sequence of values to test. Passing in a single string will
#     raise a ``TypeError``. Instead, turn a single string into a
#     list of one element.
#
# Returns
# -------
# Series
#     Series of booleans indicating if each element is in values.
#
# Raises
# ------
# TypeError
#   * If `values` is a string
#
# See Also
# --------
# DataFrame.isin : Equivalent method on DataFrame.
#
# Examples
# --------
# >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
# ...                'hippo'], name='animal')
# >>> s.isin(['cow', 'lama'])
# 0     True
# 1     True
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# To invert the boolean values, use the ``~`` operator:
#
# >>> ~s.isin(['cow', 'lama'])
# 0    False
# 1    False
# 2    False
# 3     True
# 4    False
# 5     True
# Name: animal, dtype: bool
#
# Passing a single string as ``s.isin('lama')`` will raise an error. Use
# a list of one element instead:
#
# >>> s.isin(['lama'])
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: animal, dtype: bool
#
# Strings and integers are distinct and are therefore not comparable:
#
# >>> pd.Series([1]).isin(['1'])
# 0    False
# dtype: bool
# >>> pd.Series([1.1]).isin(['1.1'])
# 0    False
# dtype: bool
#
# </code>
# <a href='#23'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Keep only the `train` rows whose item_id AND shop_id both occur somewhere
# in `test`; row order and index labels of the surviving rows are unchanged.
train = train[
    train['item_id'].isin(test['item_id'])
    & train['shop_id'].isin(test['shop_id'])
]


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>24. Data Preparation</h1>  <a id='24'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Display the (rows, columns) shape of both frames as a pair.
(train.shape, test.shape)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>25. Data Preparation | Feature Engineering</h1>  <a id='25'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.isnull</u></summary>
# <blockquote>
# <code>
# DataFrame.isnull is an alias for DataFrame.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, get mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# DataFrame
#     Mask of bool values for each element in DataFrame that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# DataFrame.isnull : Alias of isna.
# DataFrame.notna : Boolean inverse of isna.
# DataFrame.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#25'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.<locals>.sum</u></summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#25'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Count missing (NA) entries in test: isnull() builds a boolean NA mask and
# sum() adds it up along the default axis, giving one count per column.
test.isnull().sum()


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>26. Feature Engineering</h1>  <a id='26'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Interaction feature: element-wise product of shop_id and item_id, stored
# under the same 'shop*item' column name in both frames.
# NOTE(review): a plain product is not unique per (shop, item) pair
# (e.g. 2*6 == 3*4) — confirm this is intended as a feature, not a key.
train['shop*item'] = train['shop_id'].mul(train['item_id'])
test['shop*item'] = test['shop_id'].mul(test['item_id'])


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>27. Data Preparation | Feature Engineering</h1>  <a id='27'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#27'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u></summary>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#27'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Peek at the first rows of `item`.
# NOTE(review): this is not the last expression of the cell, so under
# Jupyter's default settings its result is not rendered.
item.head()
# Remove the 'item_name' column from `item` in place (drop returns None here).
item.drop(columns='item_name', inplace=True)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>28. Data Preparation | Feature Engineering</h1>  <a id='28'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.map</u></summary>
# <blockquote>
# <code>
# Map values of Series according to an input mapping or function.
#
# Used for substituting each value in a Series with another value,
# that may be derived from a function, a ``dict`` or
# a :class:`Series`.
#
# Parameters
# ----------
# arg : function, collections.abc.Mapping subclass or Series
#     Mapping correspondence.
# na_action : {None, 'ignore'}, default None
#     If 'ignore', propagate NaN values, without passing them to the
#     mapping correspondence.
#
# Returns
# -------
# Series
#     Same index as caller.
#
# See Also
# --------
# Series.apply : For applying more complex functions on a Series.
# DataFrame.apply : Apply a function row-/column-wise.
# DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
#
# Notes
# -----
# When ``arg`` is a dictionary, values in Series that are not in the
# dictionary (as keys) are converted to ``NaN``. However, if the
# dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
# provides a method for default values), then this default is used
# rather than ``NaN``.
#
# Examples
# --------
# >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
# >>> s
# 0      cat
# 1      dog
# 2      NaN
# 3   rabbit
# dtype: object
#
# ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
# in the ``dict`` are converted to ``NaN``, unless the dict has a default
# value (e.g. ``defaultdict``):
#
# >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
# 0   kitten
# 1    puppy
# 2      NaN
# 3      NaN
# dtype: object
#
# It also accepts a function:
#
# >>> s.map('I am a {}'.format)
# 0       I am a cat
# 1       I am a dog
# 2       I am a nan
# 3    I am a rabbit
# dtype: object
#
# To avoid applying the function to missing values (and keep them as
# ``NaN``) ``na_action='ignore'`` can be used:
#
# >>> s.map('I am a {}'.format, na_action='ignore')
# 0     I am a cat
# 1     I am a dog
# 2            NaN
# 3  I am a rabbit
# dtype: object
#
# </code>
# <a href='#28'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Build a lookup from the rows of `item`: dict() consumes every row as a
# (key, value) pair, so `item` must have exactly two columns at this point.
# presumably (item_id, item_category_id) once 'item_name' is dropped — TODO confirm
item_cat = dict(item.values)

# Attach the looked-up value to both frames via item_id; ids that are not
# keys of the mapping come out as NaN.
train['item_cat'] = train['item_id'].map(item_cat)
test['item_cat'] = test['item_id'].map(item_cat)



# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>29. Data Preparation</h1>  <a id='29'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.head</u></summary>
# <blockquote>
# <code>
# Return the first `n` rows.
#
# This function returns the first `n` rows for the object based
# on position. It is useful for quickly testing if your object
# has the right type of data in it.
#
# For negative values of `n`, this function returns all rows except
# the last `n` rows, equivalent to ``df[:-n]``.
#
# Parameters
# ----------
# n : int, default 5
#     Number of rows to select.
#
# Returns
# -------
# same type as caller
#     The first `n` rows of the caller object.
#
# See Also
# --------
# DataFrame.tail: Returns the last `n` rows.
#
# Examples
# --------
# >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
# ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
# >>> df
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
# 6      shark
# 7      whale
# 8      zebra
#
# Viewing the first 5 lines
#
# >>> df.head()
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
#
# Viewing the first `n` lines (three in this case)
#
# >>> df.head(3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
#
# For negative values of `n`
#
# >>> df.head(-3)
#       animal
# 0  alligator
# 1        bee
# 2     falcon
# 3       lion
# 4     monkey
# 5     parrot
#
# </code>
# <a href='#29'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Preview the first two rows of train to check the newly added columns.
train.head(2)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>30. Data Preparation</h1>  <a id='30'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.info</u></summary>
# <blockquote>
# <code>
# Print a concise summary of a Series.
#
# This method prints information about a Series including
# the index dtype, non-null values and memory usage.
#
# .. versionadded:: 1.4.0
#
# Parameters
# ----------
# data : Series
#     Series to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.    
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the Series
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always show memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based in column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
#
# Returns
# -------
# None
#     This method prints a summary of a Series and returns None.
#
# See Also
# --------
# Series.describe: Generate descriptive statistics of Series.
# Series.memory_usage: Memory usage of Series.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> s = pd.Series(text_values, index=int_values)
# >>> s.info()
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# Series name: None
# Non-Null Count  Dtype
# --------------  -----
# 5 non-null      object
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Prints a summary excluding information about its values:
#
# >>> s.info(verbose=False)
# <class 'pandas.core.series.Series'>
# Int64Index: 5 entries, 1 to 5
# dtypes: object(1)
# memory usage: 80.0+ bytes
#
# Pipe output of Series.info to buffer instead of sys.stdout, get
# buffer content and writes to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> s.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, specially
# useful for big Series and fine-tune memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
# >>> s.info()
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 7.6+ MB
#
# >>> s.info(memory_usage='deep')
# <class 'pandas.core.series.Series'>
# RangeIndex: 1000000 entries, 0 to 999999
# Series name: None
# Non-Null Count    Dtype
# --------------    -----
# 1000000 non-null  object
# dtypes: object(1)
# memory usage: 55.3 MB
#
# </code>
# <a href='#30'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.info</u></summary>
# <blockquote>
# <code>
# Print a concise summary of a DataFrame.
#
# This method prints information about a DataFrame including
# the index dtype and columns, non-null values and memory usage.
#
# Parameters
# ----------
# data : DataFrame
#     DataFrame to print information about.
# verbose : bool, optional
#     Whether to print the full summary. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is followed.
# buf : writable buffer, defaults to sys.stdout
#     Where to send the output. By default, the output is printed to
#     sys.stdout. Pass a writable buffer if you need to further process
#     the output.
# max_cols : int, optional
#     When to switch from the verbose to the truncated output. If the
#     DataFrame has more than `max_cols` columns, the truncated output
#     is used. By default, the setting in
#     ``pandas.options.display.max_info_columns`` is used.
# memory_usage : bool, str, optional
#     Specifies whether total memory usage of the DataFrame
#     elements (including the index) should be displayed. By default,
#     this follows the ``pandas.options.display.memory_usage`` setting.
#
#     True always show memory usage. False never shows memory usage.
#     A value of 'deep' is equivalent to "True with deep introspection".
#     Memory usage is shown in human-readable units (base-2
#     representation). Without deep introspection a memory estimation is
#     made based in column dtype and number of rows assuming values
#     consume the same memory amount for corresponding dtypes. With deep
#     memory introspection, a real memory usage calculation is performed
#     at the cost of computational resources.
# show_counts : bool, optional
#     Whether to show the non-null counts. By default, this is shown
#     only if the DataFrame is smaller than
#     ``pandas.options.display.max_info_rows`` and
#     ``pandas.options.display.max_info_columns``. A value of True always
#     shows the counts, and False never shows the counts.
# null_counts : bool, optional
#     .. deprecated:: 1.2.0
#         Use show_counts instead.
#
# Returns
# -------
# None
#     This method prints a summary of a DataFrame and returns None.
#
# See Also
# --------
# DataFrame.describe: Generate descriptive statistics of DataFrame
#     columns.
# DataFrame.memory_usage: Memory usage of DataFrame columns.
#
# Examples
# --------
# >>> int_values = [1, 2, 3, 4, 5]
# >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
# >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
# >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
# ...                   "float_col": float_values})
# >>> df
#     int_col text_col  float_col
# 0        1    alpha       0.00
# 1        2     beta       0.25
# 2        3    gamma       0.50
# 3        4    delta       0.75
# 4        5  epsilon       1.00
#
# Prints information of all columns:
#
# >>> df.info(verbose=True)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Data columns (total 3 columns):
#  #   Column     Non-Null Count  Dtype
# ---  ------     --------------  -----
#  0   int_col    5 non-null      int64
#  1   text_col   5 non-null      object
#  2   float_col  5 non-null      float64
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Prints a summary of columns count and its dtypes but not per column
# information:
#
# >>> df.info(verbose=False)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Columns: 3 entries, int_col to float_col
# dtypes: float64(1), int64(1), object(1)
# memory usage: 248.0+ bytes
#
# Pipe output of DataFrame.info to buffer instead of sys.stdout, get
# buffer content and writes to a text file:
#
# >>> import io
# >>> buffer = io.StringIO()
# >>> df.info(buf=buffer)
# >>> s = buffer.getvalue()
# >>> with open("df_info.txt", "w",
# ...           encoding="utf-8") as f:  # doctest: +SKIP
# ...     f.write(s)
# 260
#
# The `memory_usage` parameter allows deep introspection mode, specially
# useful for big DataFrames and fine-tune memory optimization:
#
# >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
# >>> df = pd.DataFrame({
# ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
# ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
# ... })
# >>> df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 22.9+ MB
#
# >>> df.info(memory_usage='deep')
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000000 entries, 0 to 999999
# Data columns (total 3 columns):
#  #   Column    Non-Null Count    Dtype
# ---  ------    --------------    -----
#  0   column_1  1000000 non-null  object
#  1   column_2  1000000 non-null  object
#  2   column_3  1000000 non-null  object
# dtypes: object(3)
# memory usage: 165.9 MB
#
# </code>
# <a href='#30'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Print a concise summary of train (index dtype, columns, non-null counts,
# memory usage); info() prints its report and returns None.
train.info()


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>31. Library Loading</h1>  <a id='31'></a><small><a href='#top_phases'>back to top</a></small>

# %%
import seaborn as sns
import matplotlib.pyplot as plt


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>32. Data Preparation | Feature Engineering</h1>  <a id='32'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u></summary>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#32'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Stack train on top of test along the row axis. ignore_index is left at its
# default, so each frame keeps its original index labels (duplicates are
# possible), and any column present in only one frame is filled with NaN
# for the rows of the other.
df = pd.concat(objs=[train, test], axis=0)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>33. Visualization</h1>  <a id='33'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>seaborn</h2>
# <ul>
# <li>
# <details><summary><u>seaborn.distributions.histplot</u></summary>
# <blockquote>
# <code>
# Plot univariate or bivariate histograms to show distributions of datasets.
#
# A histogram is a classic visualization tool that represents the distribution
# of one or more variables by counting the number of observations that fall within
# discrete bins.
#
# This function can normalize the statistic computed within each bin to estimate
# frequency, density or probability mass, and it can add a smooth curve obtained
# using a kernel density estimate, similar to :func:`kdeplot`.
#
# More information is provided in the :ref:`user guide <tutorial_hist>`.
#
# Parameters
# ----------
# data : :class:`pandas.DataFrame`, :class:`numpy.ndarray`, mapping, or sequence
#     Input data structure. Either a long-form collection of vectors that can be
#     assigned to named variables or a wide-form dataset that will be internally
#     reshaped.
# x, y : vectors or keys in ``data``
#     Variables that specify positions on the x and y axes.
# hue : vector or key in ``data``
#     Semantic variable that is mapped to determine the color of plot elements.
# weights : vector or key in ``data``
#     If provided, weight the contribution of the corresponding data points
#     towards the count in each bin by these factors.
# stat : str
#     Aggregate statistic to compute in each bin.
#     
#     - `count`: show the number of observations in each bin
#     - `frequency`: show the number of observations divided by the bin width
#     - `probability`: or `proportion`: normalize such that bar heights sum to 1
#     - `percent`: normalize such that bar heights sum to 100
#     - `density`: normalize such that the total area of the histogram equals 1
# bins : str, number, vector, or a pair of such values
#     Generic bin parameter that can be the name of a reference rule,
#     the number of bins, or the breaks of the bins.
#     Passed to :func:`numpy.histogram_bin_edges`.
# binwidth : number or pair of numbers
#     Width of each bin, overrides ``bins`` but can be used with
#     ``binrange``.
# binrange : pair of numbers or a pair of pairs
#     Lowest and highest value for bin edges; can be used either
#     with ``bins`` or ``binwidth``. Defaults to data extremes.
# discrete : bool
#     If True, default to ``binwidth=1`` and draw the bars so that they are
#     centered on their corresponding data points. This avoids "gaps" that may
#     otherwise appear when using discrete (integer) data.
# cumulative : bool
#     If True, plot the cumulative counts as bins increase.
# common_bins : bool
#     If True, use the same bins when semantic variables produce multiple
#     plots. If using a reference rule to determine the bins, it will be computed
#     with the full dataset.
# common_norm : bool
#     If True and using a normalized statistic, the normalization will apply over
#     the full dataset. Otherwise, normalize each histogram independently.
# multiple : {"layer", "dodge", "stack", "fill"}
#     Approach to resolving multiple elements when semantic mapping creates subsets.
#     Only relevant with univariate data.
# element : {"bars", "step", "poly"}
#     Visual representation of the histogram statistic.
#     Only relevant with univariate data.
# fill : bool
#     If True, fill in the space under the histogram.
#     Only relevant with univariate data.
# shrink : number
#     Scale the width of each bar relative to the binwidth by this factor.
#     Only relevant with univariate data.
# kde : bool
#     If True, compute a kernel density estimate to smooth the distribution
#     and show on the plot as (one or more) line(s).
#     Only relevant with univariate data.
# kde_kws : dict
#     Parameters that control the KDE computation, as in :func:`kdeplot`.
# line_kws : dict
#     Parameters that control the KDE visualization, passed to
#     :meth:`matplotlib.axes.Axes.plot`.
# thresh : number or None
#     Cells with a statistic less than or equal to this value will be transparent.
#     Only relevant with bivariate data.
# pthresh : number or None
#     Like ``thresh``, but a value in [0, 1] such that cells with aggregate counts
#     (or other statistics, when used) up to this proportion of the total will be
#     transparent.
# pmax : number or None
#     A value in [0, 1] that sets the saturation point for the colormap at a value
#     such that cells below it constitute this proportion of the total count (or
#     other statistic, when used).
# cbar : bool
#     If True, add a colorbar to annotate the color mapping in a bivariate plot.
#     Note: Does not currently support plots with a ``hue`` variable well.
# cbar_ax : :class:`matplotlib.axes.Axes`
#     Pre-existing axes for the colorbar.
# cbar_kws : dict
#     Additional parameters passed to :meth:`matplotlib.figure.Figure.colorbar`.
# palette : string, list, dict, or :class:`matplotlib.colors.Colormap`
#     Method for choosing the colors to use when mapping the ``hue`` semantic.
#     String values are passed to :func:`color_palette`. List or dict values
#     imply categorical mapping, while a colormap object implies numeric mapping.
# hue_order : vector of strings
#     Specify the order of processing and plotting for categorical levels of the
#     ``hue`` semantic.
# hue_norm : tuple or :class:`matplotlib.colors.Normalize`
#     Either a pair of values that set the normalization range in data units
#     or an object that will map from data units into a [0, 1] interval. Usage
#     implies numeric mapping.
# color : :mod:`matplotlib color <matplotlib.colors>`
#     Single color specification for when hue mapping is not used. Otherwise, the
#     plot will try to hook into the matplotlib property cycle.
# log_scale : bool or number, or pair of bools or numbers
#     Set axis scale(s) to log. A single value sets the data axis for univariate
#     distributions and both axes for bivariate distributions. A pair of values
#     sets each axis independently. Numeric values are interpreted as the desired
#     base (default 10). If `False`, defer to the existing Axes scale.
# legend : bool
#     If False, suppress the legend for semantic variables.
# ax : :class:`matplotlib.axes.Axes`
#     Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca`
#     internally.
# kwargs
#     Other keyword arguments are passed to one of the following matplotlib
#     functions:
#
#     - :meth:`matplotlib.axes.Axes.bar` (univariate, element="bars")
#     - :meth:`matplotlib.axes.Axes.fill_between` (univariate, other element, fill=True)
#     - :meth:`matplotlib.axes.Axes.plot` (univariate, other element, fill=False)
#     - :meth:`matplotlib.axes.Axes.pcolormesh` (bivariate)
#
# Returns
# -------
# :class:`matplotlib.axes.Axes`
#     The matplotlib axes containing the plot.
#
# See Also
# --------
# displot : Figure-level interface to distribution plot functions.
# kdeplot : Plot univariate or bivariate distributions using kernel density estimation.
# rugplot : Plot a tick at each observation value along the x and/or y axes.
# ecdfplot : Plot empirical cumulative distribution functions.
# jointplot : Draw a bivariate plot with univariate marginal distributions.
#
# Notes
# -----
#
# The choice of bins for computing and plotting a histogram can exert
# substantial influence on the insights that one is able to draw from the
# visualization. If the bins are too large, they may erase important features.
# On the other hand, bins that are too small may be dominated by random
# variability, obscuring the shape of the true underlying distribution. The
# default bin size is determined using a reference rule that depends on the
# sample size and variance. This works well in many cases, (i.e., with
# "well-behaved" data) but it fails in others. It is always a good idea to try
# different bin sizes to be sure that you are not missing something important.
# This function allows you to specify bins in several different ways, such as
# by setting the total number of bins to use, the width of each bin, or the
# specific locations where the bins should break.
#
# Examples
# --------
#
# .. include:: ../docstrings/histplot.rst
#
# </code>
# <a href='#33'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Histogram of the item_price column; the trailing semicolon suppresses
# the notebook's echo of the returned Axes object.
sns.histplot(data=df['item_price']);


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>34. Data Preparation | Feature Engineering</h1>  <a id='34'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u></summary>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#34'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.fillna</u></summary>
# <blockquote>
# <code>
# Fill NA/NaN values using the specified method.
#
# Parameters
# ----------
# value : scalar, dict, Series, or DataFrame
#     Value to use to fill holes (e.g. 0), alternately a
#     dict/Series/DataFrame of values specifying which value to use for
#     each index (for a Series) or column (for a DataFrame).  Values not
#     in the dict/Series/DataFrame will not be filled. This value cannot
#     be a list.
# method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
#     Method to use for filling holes in reindexed Series
#     pad / ffill: propagate last valid observation forward to next valid
#     backfill / bfill: use next valid observation to fill gap.
# axis : {0 or 'index'}
#     Axis along which to fill missing values.
# inplace : bool, default False
#     If True, fill in-place. Note: this will modify any
#     other views on this object (e.g., a no-copy slice for a column in a
#     DataFrame).
# limit : int, default None
#     If method is specified, this is the maximum number of consecutive
#     NaN values to forward/backward fill. In other words, if there is
#     a gap with more than this number of consecutive NaNs, it will only
#     be partially filled. If method is not specified, this is the
#     maximum number of entries along the entire axis where NaNs will be
#     filled. Must be greater than 0 if not None.
# downcast : dict, default is None
#     A dict of item->dtype of what to downcast if possible,
#     or the string 'infer' which will try to downcast to an appropriate
#     equal type (e.g. float64 to int64 if possible).
#
# Returns
# -------
# Series or None
#     Object with missing values filled or None if ``inplace=True``.
#
# See Also
# --------
# interpolate : Fill NaN values using interpolation.
# reindex : Conform object to new index.
# asfreq : Convert TimeSeries to specified frequency.
#
# Examples
# --------
# >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
# ...                    [3, 4, np.nan, 1],
# ...                    [np.nan, np.nan, np.nan, np.nan],
# ...                    [np.nan, 3, np.nan, 4]],
# ...                   columns=list("ABCD"))
# >>> df
#      A    B   C    D
# 0  NaN  2.0 NaN  0.0
# 1  3.0  4.0 NaN  1.0
# 2  NaN  NaN NaN  NaN
# 3  NaN  3.0 NaN  4.0
#
# Replace all NaN elements with 0s.
#
# >>> df.fillna(0)
#      A    B    C    D
# 0  0.0  2.0  0.0  0.0
# 1  3.0  4.0  0.0  1.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  3.0  0.0  4.0
#
# We can also propagate non-null values forward or backward.
#
# >>> df.fillna(method="ffill")
#      A    B   C    D
# 0  NaN  2.0 NaN  0.0
# 1  3.0  4.0 NaN  1.0
# 2  3.0  4.0 NaN  1.0
# 3  3.0  3.0 NaN  4.0
#
# Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
# 2, and 3 respectively.
#
# >>> values = {"A": 0, "B": 1, "C": 2, "D": 3}
# >>> df.fillna(value=values)
#      A    B    C    D
# 0  0.0  2.0  2.0  0.0
# 1  3.0  4.0  2.0  1.0
# 2  0.0  1.0  2.0  3.0
# 3  0.0  3.0  2.0  4.0
#
# Only replace the first NaN element.
#
# >>> df.fillna(value=values, limit=1)
#      A    B    C    D
# 0  0.0  2.0  2.0  0.0
# 1  3.0  4.0  NaN  1.0
# 2  NaN  1.0  NaN  3.0
# 3  NaN  3.0  NaN  4.0
#
# When filling using a DataFrame, replacement happens along
# the same column names and same indices
#
# >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
# >>> df.fillna(df2)
#      A    B    C    D
# 0  0.0  2.0  0.0  0.0
# 1  3.0  4.0  0.0  1.0
# 2  0.0  0.0  0.0  NaN
# 3  0.0  3.0  0.0  4.0
#
# Note that column D is not affected since it is not present in df2.
#
# </code>
# <a href='#34'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.<locals>.mean</u></summary>
# <blockquote>
# <code>
# Return the mean of the values over the requested axis.
#
# Parameters
# ----------
# axis : {index (0)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a scalar.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# scalar or Series (if level specified)
#
# </code>
# <a href='#34'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%

# Stack train and test rows into one frame so the steps below apply to both.
df = pd.concat([train, test])

# Normalize: compress the price scale with log(1 + x).
df['item_price'] = np.log1p(df['item_price'])

# Fill the missing prices with the mean of the log-transformed prices.
df['item_price'] = df['item_price'].fillna(df['item_price'].mean())

# Remove the outliers: cap daily counts at 10. Series.clip is vectorised and,
# like the per-element comparison it replaces, leaves missing (NaN) counts as
# they are.
df['item_cnt_day'] = df['item_cnt_day'].clip(upper=10)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>35. Data Preparation</h1>  <a id='35'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.notnull</u></summary>
# <blockquote>
# <code>
# Series.notnull is an alias for Series.notna.
#
# Detect existing (non-missing) values.
#
# Return a boolean same-sized object indicating if the values are not NA.
# Non-missing values get mapped to True. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
# NA values, such as None or :attr:`numpy.NaN`, get mapped to False
# values.
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is not an NA value.
#
# See Also
# --------
# Series.notnull : Alias of notna.
# Series.isna : Boolean inverse of notna.
# Series.dropna : Omit axes labels with missing values.
# notna : Top-level notna.
#
# Examples
# --------
# Show which entries in a DataFrame are not NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.notna()
#      age   born  name    toy
# 0   True  False  True  False
# 1   True   True  True   True
# 2  False   True  True   True
#
# Show which entries in a Series are not NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.notna()
# 0     True
# 1     True
# 2    False
# dtype: bool
#
# </code>
# <a href='#35'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.isnull</u></summary>
# <blockquote>
# <code>
# Series.isnull is an alias for Series.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, gets mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# Series.isnull : Alias of isna.
# Series.notna : Boolean inverse of isna.
# Series.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#35'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Split the combined frame back apart using the item_cnt_day column:
# rows with a value become train, rows where it is missing become test.
train = df.loc[df['item_cnt_day'].notna()]
test = df.loc[df['item_cnt_day'].isna()]


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>36. Data Preparation</h1>  <a id='36'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Display the (rows, columns) shape of the train frame.
train.shape


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>37. Data Preparation | Feature Engineering</h1>  <a id='37'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.isnull</u></summary>
# <blockquote>
# <code>
# DataFrame.isnull is an alias for DataFrame.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, gets mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# DataFrame
#     Mask of bool values for each element in DataFrame that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# DataFrame.isnull : Alias of isna.
# DataFrame.notna : Boolean inverse of isna.
# DataFrame.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#37'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.series.Series.isnull</u></summary>
# <blockquote>
# <code>
# Series.isnull is an alias for Series.isna.
#
# Detect missing values.
#
# Return a boolean same-sized object indicating if the values are NA.
# NA values, such as None or :attr:`numpy.NaN`, gets mapped to True
# values.
# Everything else gets mapped to False values. Characters such as empty
# strings ``''`` or :attr:`numpy.inf` are not considered NA values
# (unless you set ``pandas.options.mode.use_inf_as_na = True``).
#
# Returns
# -------
# Series
#     Mask of bool values for each element in Series that
#     indicates whether an element is an NA value.
#
# See Also
# --------
# Series.isnull : Alias of isna.
# Series.notna : Boolean inverse of isna.
# Series.dropna : Omit axes labels with missing values.
# isna : Top-level isna.
#
# Examples
# --------
# Show which entries in a DataFrame are NA.
#
# >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
# ...                    born=[pd.NaT, pd.Timestamp('1939-05-27'),
# ...                          pd.Timestamp('1940-04-25')],
# ...                    name=['Alfred', 'Batman', ''],
# ...                    toy=[None, 'Batmobile', 'Joker']))
# >>> df
#    age       born    name        toy
# 0  5.0        NaT  Alfred       None
# 1  6.0 1939-05-27  Batman  Batmobile
# 2  NaN 1940-04-25              Joker
#
# >>> df.isna()
#      age   born   name    toy
# 0  False   True  False   True
# 1  False  False  False  False
# 2   True  False  False  False
#
# Show which entries in a Series are NA.
#
# >>> ser = pd.Series([5, 6, np.NaN])
# >>> ser
# 0    5.0
# 1    6.0
# 2    NaN
# dtype: float64
#
# >>> ser.isna()
# 0    False
# 1    False
# 2     True
# dtype: bool
#
# </code>
# <a href='#37'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame._add_numeric_operations.<locals>.sum</u></summary>
# <blockquote>
# <code>
# Return the sum of the values over the requested axis.
#
# This is equivalent to the method ``numpy.sum``.
#
# Parameters
# ----------
# axis : {index (0), columns (1)}
#     Axis for the function to be applied on.
# skipna : bool, default True
#     Exclude NA/null values when computing the result.
# level : int or level name, default None
#     If the axis is a MultiIndex (hierarchical), count along a
#     particular level, collapsing into a Series.
# numeric_only : bool, default None
#     Include only float, int, boolean columns. If None, will attempt to use
#     everything, then use only numeric data. Not implemented for Series.
# min_count : int, default 0
#     The required number of valid values to perform the operation. If fewer than
#     ``min_count`` non-NA values are present the result will be NA.
# **kwargs
#     Additional keyword arguments to be passed to the function.
#
# Returns
# -------
# Series or DataFrame (if level specified)
#
# See Also
# --------
# Series.sum : Return the sum.
# Series.min : Return the minimum.
# Series.max : Return the maximum.
# Series.idxmin : Return the index of the minimum.
# Series.idxmax : Return the index of the maximum.
# DataFrame.sum : Return the sum over the requested axis.
# DataFrame.min : Return the minimum over the requested axis.
# DataFrame.max : Return the maximum over the requested axis.
# DataFrame.idxmin : Return the index of the minimum over the requested axis.
# DataFrame.idxmax : Return the index of the maximum over the requested axis.
#
# Examples
# --------
# >>> idx = pd.MultiIndex.from_arrays([
# ...     ['warm', 'warm', 'cold', 'cold'],
# ...     ['dog', 'falcon', 'fish', 'spider']],
# ...     names=['blooded', 'animal'])
# >>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
# >>> s
# blooded  animal
# warm     dog       4
#          falcon    2
# cold     fish      0
#          spider    8
# Name: legs, dtype: int64
#
# >>> s.sum()
# 14
#
# By default, the sum of an empty or all-NA Series is ``0``.
#
# >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
# 0.0
#
# This can be controlled with the ``min_count`` parameter. For example, if
# you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
#
# >>> pd.Series([], dtype="float64").sum(min_count=1)
# nan
#
# Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
# empty series identically.
#
# >>> pd.Series([np.nan]).sum()
# 0.0
#
# >>> pd.Series([np.nan]).sum(min_count=1)
# nan
#
# </code>
# <a href='#37'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Count the missing values in each column of the test frame.
test.isna().sum(axis=0)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>38. Data Preparation | Feature Engineering</h1>  <a id='38'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u></summary>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#38'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u></summary>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#38'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Remove the 'item_cnt_day' column from `test`, modifying the frame in place.
test.drop(columns='item_cnt_day', inplace=True)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>39. Data Preparation</h1>  <a id='39'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Display the dimensions of `test` after the column drop above.
test.shape


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>40. Data Preparation | Feature Engineering</h1>  <a id='40'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.series.Series.drop</u></summary>
# <blockquote>
# <code>
# Return Series with specified index labels removed.
#
# Remove elements of a Series based on specifying the index labels.
# When using a multi-index, labels on different levels can be removed
# by specifying the level.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index labels to drop.
# axis : 0, default 0
#     Redundant for application on Series.
# index : single label or list-like
#     Redundant for application on Series, but 'index' can be used instead
#     of 'labels'.
# columns : single label or list-like
#     No change is made to the Series; use 'index' or 'labels' instead.
# level : int or level name, optional
#     For MultiIndex, level for which the labels will be removed.
# inplace : bool, default False
#     If True, do operation inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are dropped.
#
# Returns
# -------
# Series or None
#     Series with specified index labels removed or None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If none of the labels are found in the index.
#
# See Also
# --------
# Series.reindex : Return only specified index labels of Series.
# Series.dropna : Return series without null values.
# Series.drop_duplicates : Return Series with duplicate values removed.
# DataFrame.drop : Drop specified labels from rows or columns.
#
# Examples
# --------
# >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
# >>> s
# A  0
# B  1
# C  2
# dtype: int64
#
# Drop labels B and C
#
# >>> s.drop(labels=['B', 'C'])
# A  0
# dtype: int64
#
# Drop 2nd level label in MultiIndex Series
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
# ...               index=midx)
# >>> s
# lama    speed      45.0
#         weight    200.0
#         length      1.2
# cow     speed      30.0
#         weight    250.0
#         length      1.5
# falcon  speed     320.0
#         weight      1.0
#         length      0.3
# dtype: float64
#
# >>> s.drop(labels='weight', level=1)
# lama    speed      45.0
#         length      1.2
# cow     speed      30.0
#         length      1.5
# falcon  speed     320.0
#         length      0.3
# dtype: float64
#
# </code>
# <a href='#40'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame.drop</u></summary>
# <blockquote>
# <code>
# Drop specified labels from rows or columns.
#
# Remove rows or columns by specifying label names and corresponding
# axis, or by specifying directly index or column names. When using a
# multi-index, labels on different levels can be removed by specifying
# the level. See the `user guide <advanced.shown_levels>`
# for more information about the now unused levels.
#
# Parameters
# ----------
# labels : single label or list-like
#     Index or column labels to drop. A tuple will be used as a single
#     label and not treated as a list-like.
# axis : {0 or 'index', 1 or 'columns'}, default 0
#     Whether to drop labels from the index (0 or 'index') or
#     columns (1 or 'columns').
# index : single label or list-like
#     Alternative to specifying axis (``labels, axis=0``
#     is equivalent to ``index=labels``).
# columns : single label or list-like
#     Alternative to specifying axis (``labels, axis=1``
#     is equivalent to ``columns=labels``).
# level : int or level name, optional
#     For MultiIndex, level from which the labels will be removed.
# inplace : bool, default False
#     If False, return a copy. Otherwise, do operation
#     inplace and return None.
# errors : {'ignore', 'raise'}, default 'raise'
#     If 'ignore', suppress error and only existing labels are
#     dropped.
#
# Returns
# -------
# DataFrame or None
#     DataFrame without the removed index or column labels or
#     None if ``inplace=True``.
#
# Raises
# ------
# KeyError
#     If any of the labels is not found in the selected axis.
#
# See Also
# --------
# DataFrame.loc : Label-location based indexer for selection by label.
# DataFrame.dropna : Return DataFrame with labels on given axis omitted
#     where (all or any) data are missing.
# DataFrame.drop_duplicates : Return DataFrame with duplicate rows
#     removed, optionally only considering certain columns.
# Series.drop : Return Series with specified index labels removed.
#
# Examples
# --------
# >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
# ...                   columns=['A', 'B', 'C', 'D'])
# >>> df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
#
# Drop columns
#
# >>> df.drop(['B', 'C'], axis=1)
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# >>> df.drop(columns=['B', 'C'])
#    A   D
# 0  0   3
# 1  4   7
# 2  8  11
#
# Drop a row by index
#
# >>> df.drop([0, 1])
#    A  B   C   D
# 2  8  9  10  11
#
# Drop columns and/or rows of MultiIndex DataFrame
#
# >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
# ...                              ['speed', 'weight', 'length']],
# ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
# ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
# >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
# ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
# ...                         [250, 150], [1.5, 0.8], [320, 250],
# ...                         [1, 0.8], [0.3, 0.2]])
# >>> df
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#         length  0.3     0.2
#
# Drop a specific index combination from the MultiIndex
# DataFrame, i.e., drop the combination ``'falcon'`` and
# ``'weight'``, which deletes only the corresponding row
#
# >>> df.drop(index=('falcon', 'weight'))
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
#         length  1.5     1.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
#         length  1.5     0.8
# falcon  speed   320.0   250.0
#         length  0.3     0.2
#
# >>> df.drop(index='cow', columns='small')
#                 big
# lama    speed   45.0
#         weight  200.0
#         length  1.5
# falcon  speed   320.0
#         weight  1.0
#         length  0.3
#
# >>> df.drop(index='length', level=1)
#                 big     small
# lama    speed   45.0    30.0
#         weight  200.0   100.0
# cow     speed   30.0    20.0
#         weight  250.0   150.0
# falcon  speed   320.0   250.0
#         weight  1.0     0.8
#
# </code>
# <a href='#40'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Separate `train` into the target ('item_cnt_day') and the remaining
# feature columns, keeping only the underlying values of each.
y_train = train['item_cnt_day'].values
x_train = train.drop(columns='item_cnt_day').values


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>41. Data Preparation</h1>  <a id='41'></a><small><a href='#top_phases'>back to top</a></small>

# %%
# Use the raw values of `test` as the test feature matrix. x_train is built
# with `.values`, and the scaler is fitted on that plain array; taking
# `.values` here as well keeps both inputs the same kind of object, so the
# scaler is not later handed a DataFrame carrying column names it never saw
# during fit (scikit-learn emits a feature-name mismatch warning in that case).
x_test = test.values


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>42. Data Preparation | Feature Engineering | Library Loading</h1>  <a id='42'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>sklearn</h2>
# <ul>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler</u></summary>
# <blockquote>
# <code>
# Transform features by scaling each feature to a given range.
#
# This estimator scales and translates each feature individually such
# that it is in the given range on the training set, e.g. between
# zero and one.
#
# The transformation is given by::
#
#     X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#     X_scaled = X_std * (max - min) + min
#
# where min, max = feature_range.
#
# This transformation is often used as an alternative to zero mean,
# unit variance scaling.
#
# Read more in the :ref:`User Guide <preprocessing_scaler>`.
#
# Parameters
# ----------
# feature_range : tuple (min, max), default=(0, 1)
#     Desired range of transformed data.
#
# copy : bool, default=True
#     Set to False to perform inplace row normalization and avoid a
#     copy (if the input is already a numpy array).
#
# clip : bool, default=False
#     Set to True to clip transformed values of held-out data to
#     provided `feature range`.
#
#     .. versionadded:: 0.24
#
# Attributes
# ----------
# min_ : ndarray of shape (n_features,)
#     Per feature adjustment for minimum. Equivalent to
#     ``min - X.min(axis=0) * self.scale_``
#
# scale_ : ndarray of shape (n_features,)
#     Per feature relative scaling of the data. Equivalent to
#     ``(max - min) / (X.max(axis=0) - X.min(axis=0))``
#
#     .. versionadded:: 0.17
#        *scale_* attribute.
#
# data_min_ : ndarray of shape (n_features,)
#     Per feature minimum seen in the data
#
#     .. versionadded:: 0.17
#        *data_min_*
#
# data_max_ : ndarray of shape (n_features,)
#     Per feature maximum seen in the data
#
#     .. versionadded:: 0.17
#        *data_max_*
#
# data_range_ : ndarray of shape (n_features,)
#     Per feature range ``(data_max_ - data_min_)`` seen in the data
#
#     .. versionadded:: 0.17
#        *data_range_*
#
# n_features_in_ : int
#     Number of features seen during :term:`fit`.
#
#     .. versionadded:: 0.24
#
# n_samples_seen_ : int
#     The number of samples processed by the estimator.
#     It will be reset on new calls to fit, but increments across
#     ``partial_fit`` calls.
#
# feature_names_in_ : ndarray of shape (`n_features_in_`,)
#     Names of features seen during :term:`fit`. Defined only when `X`
#     has feature names that are all strings.
#
#     .. versionadded:: 1.0
#
# See Also
# --------
# minmax_scale : Equivalent function without the estimator API.
#
# Notes
# -----
# NaNs are treated as missing values: disregarded in fit, and maintained in
# transform.
#
# For a comparison of the different scalers, transformers, and normalizers,
# see :ref:`examples/preprocessing/plot_all_scaling.py
# <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
#
# Examples
# --------
# >>> from sklearn.preprocessing import MinMaxScaler
# >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
# >>> scaler = MinMaxScaler()
# >>> print(scaler.fit(data))
# MinMaxScaler()
# >>> print(scaler.data_max_)
# [ 1. 18.]
# >>> print(scaler.transform(data))
# [[0.   0.  ]
#  [0.25 0.25]
#  [0.5  0.5 ]
#  [1.   1.  ]]
# >>> print(scaler.transform([[2, 2]]))
# [[1.5 0. ]]
#
# </code>
# <a href='#42'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.base.TransformerMixin.fit_transform</u></summary>
# <blockquote>
# <code>
# Fit to data, then transform it.
#
# Fits transformer to `X` and `y` with optional parameters `fit_params`
# and returns a transformed version of `X`.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input samples.
#
# y :  array-like of shape (n_samples,) or (n_samples, n_outputs),                 default=None
#     Target values (None for unsupervised transformations).
#
# **fit_params : dict
#     Additional fit parameters.
#
# Returns
# -------
# X_new : ndarray array of shape (n_samples, n_features_new)
#     Transformed array.
#
# </code>
# <a href='#42'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>sklearn.preprocessing._data.MinMaxScaler.transform</u></summary>
# <blockquote>
# <code>
# Scale features of X according to feature_range.
#
# Parameters
# ----------
# X : array-like of shape (n_samples, n_features)
#     Input data that will be transformed.
#
# Returns
# -------
# Xt : ndarray of shape (n_samples, n_features)
#     Transformed data.
#
# </code>
# <a href='#42'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to both the training and the test features.
# (Alternative scaler left by the author: SC = StandardScaler())
SC = MinMaxScaler().fit(x_train)
x_train = SC.transform(x_train)
x_test = SC.transform(x_test)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>43. Library Loading</h1>  <a id='43'></a><small><a href='#top_phases'>back to top</a></small>

# %%
import keras 
from keras.models import Sequential 
from keras.layers import Dense


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>44. Model Building and Training</h1>  <a id='44'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>keras</h2>
# <ul>
# <li>
# <details><summary><u>keras.engine.sequential.Sequential</u></summary>
# <blockquote>
# <code>
# `Sequential` groups a linear stack of layers into a `tf.keras.Model`.
#
# `Sequential` provides training and inference features on this model.
#
# Examples:
#
# ```python
# Optionally, the first layer can receive an `input_shape` argument:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8, input_shape=(16,)))
# Afterwards, we do automatic shape inference:
# model.add(tf.keras.layers.Dense(4))
#
# This is identical to the following:
# model = tf.keras.Sequential()
# model.add(tf.keras.Input(shape=(16,)))
# model.add(tf.keras.layers.Dense(8))
#
# Note that you can also omit the `input_shape` argument.
# In that case the model doesn't have any weights until the first call
# to a training/evaluation method (since it isn't yet built):
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(4))
# model.weights not created yet
#
# Whereas if you specify the input shape, the model gets built
# continuously as you are adding layers:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8, input_shape=(16,)))
# model.add(tf.keras.layers.Dense(4))
# len(model.weights)
# Returns "4"
#
# When using the delayed-build pattern (no input shape specified), you can
# choose to manually build your model by calling
# `build(batch_input_shape)`:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(4))
# model.build((None, 16))
# len(model.weights)
# Returns "4"
#
# Note that when using the delayed-build pattern (no input shape specified),
# the model gets built the first time you call `fit`, `eval`, or `predict`,
# or the first time you call the model on some input data.
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(8))
# model.add(tf.keras.layers.Dense(1))
# model.compile(optimizer='sgd', loss='mse')
# This builds the model for the first time:
# model.fit(x, y, batch_size=32, epochs=10)
# ```
#
# </code>
# <a href='#44'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.layers.core.dense.Dense</u></summary>
# <blockquote>
# <code>
# Just your regular densely-connected NN layer.
#
# `Dense` implements the operation:
# `output = activation(dot(input, kernel) + bias)`
# where `activation` is the element-wise activation function
# passed as the `activation` argument, `kernel` is a weights matrix
# created by the layer, and `bias` is a bias vector created by the layer
# (only applicable if `use_bias` is `True`). These are all attributes of
# `Dense`.
#
# Note: If the input to the layer has a rank greater than 2, then `Dense`
# computes the dot product between the `inputs` and the `kernel` along the
# last axis of the `inputs` and axis 0 of the `kernel` (using `tf.tensordot`).
# For example, if input has dimensions `(batch_size, d0, d1)`,
# then we create a `kernel` with shape `(d1, units)`, and the `kernel` operates
# along axis 2 of the `input`, on every sub-tensor of shape `(1, 1, d1)`
# (there are `batch_size * d0` such sub-tensors).
# The output in this case will have shape `(batch_size, d0, units)`.
#
# Besides, layer attributes cannot be modified after the layer has been called
# once (except the `trainable` attribute).
# When a popular kwarg `input_shape` is passed, then keras will create
# an input layer to insert before the current layer. This can be treated
# equivalent to explicitly defining an `InputLayer`.
#
# Example:
#
# >>> # Create a `Sequential` model and add a Dense layer as the first layer.
# >>> model = tf.keras.models.Sequential()
# >>> model.add(tf.keras.Input(shape=(16,)))
# >>> model.add(tf.keras.layers.Dense(32, activation='relu'))
# >>> # Now the model will take as input arrays of shape (None, 16)
# >>> # and output arrays of shape (None, 32).
# >>> # Note that after the first layer, you don't need to specify
# >>> # the size of the input anymore:
# >>> model.add(tf.keras.layers.Dense(32))
# >>> model.output_shape
# (None, 32)
#
# Args:
#   units: Positive integer, dimensionality of the output space.
#   activation: Activation function to use.
#     If you don't specify anything, no activation is applied
#     (ie. "linear" activation: `a(x) = x`).
#   use_bias: Boolean, whether the layer uses a bias vector.
#   kernel_initializer: Initializer for the `kernel` weights matrix.
#   bias_initializer: Initializer for the bias vector.
#   kernel_regularizer: Regularizer function applied to
#     the `kernel` weights matrix.
#   bias_regularizer: Regularizer function applied to the bias vector.
#   activity_regularizer: Regularizer function applied to
#     the output of the layer (its "activation").
#   kernel_constraint: Constraint function applied to
#     the `kernel` weights matrix.
#   bias_constraint: Constraint function applied to the bias vector.
#
# Input shape:
#   N-D tensor with shape: `(batch_size, ..., input_dim)`.
#   The most common situation would be
#   a 2D input with shape `(batch_size, input_dim)`.
#
# Output shape:
#   N-D tensor with shape: `(batch_size, ..., units)`.
#   For instance, for a 2D input with shape `(batch_size, input_dim)`,
#   the output would have shape `(batch_size, units)`.
#
# </code>
# <a href='#44'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.sequential.Sequential.add</u></summary>
# <blockquote>
# <code>
# Adds a layer instance on top of the layer stack.
#
# Args:
#     layer: layer instance.
#
# Raises:
#     TypeError: If `layer` is not a layer instance.
#     ValueError: In case the `layer` argument does not
#         know its input shape.
#     ValueError: In case the `layer` argument has
#         multiple output tensors, or is already connected
#         somewhere else (forbidden in `Sequential` models).
#
# </code>
# <a href='#44'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>keras.engine.training.Model.summary</u></summary>
# <blockquote>
# <code>
# Prints a string summary of the network.
#
# Args:
#     line_length: Total length of printed lines
#         (e.g. set this to adapt the display to different
#         terminal window sizes).
#     positions: Relative or absolute positions of log elements
#         in each line. If not provided,
#         defaults to `[.33, .55, .67, 1.]`.
#     print_fn: Print function to use. Defaults to `print`.
#         It will be called on each line of the summary.
#         You can set it to a custom function
#         in order to capture the string summary.
#     expand_nested: Whether to expand the nested models.
#         If not provided, defaults to `False`.
#     show_trainable: Whether to show if a layer is trainable.
#         If not provided, defaults to `False`.
#
# Raises:
#     ValueError: if `summary()` is called before the model is built.
#
# </code>
# <a href='#44'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%

# Build the network as a linear stack of fully connected layers:
# three ReLU hidden layers (9, 9 and 5 units) and one linear output unit.
# NOTE(review): input_dim=6 presumably equals the number of feature columns
# in x_train — confirm before changing the feature set.
model = Sequential([
    Dense(9, kernel_initializer='uniform', activation='relu', input_dim=6),
    Dense(9, kernel_initializer='uniform', activation='relu'),
    Dense(5, kernel_initializer='uniform', activation='relu'),
    Dense(1, kernel_initializer='uniform', activation='linear'),
])

# Print the layer-by-layer summary of the network.
model.summary()


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>45. Model Building and Training</h1>  <a id='45'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>keras</h2>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.compile</u></summary>
# <blockquote>
# <code>
# Configures the model for training.
#
# Example:
#
# ```python
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
#               loss=tf.keras.losses.BinaryCrossentropy(),
#               metrics=[tf.keras.metrics.BinaryAccuracy(),
#                        tf.keras.metrics.FalseNegatives()])
# ```
#
# Args:
#     optimizer: String (name of optimizer) or optimizer instance. See
#       `tf.keras.optimizers`.
#     loss: Loss function. May be a string (name of loss function), or
#       a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss
#       function is any callable with the signature `loss = fn(y_true,
#       y_pred)`, where `y_true` are the ground truth values, and
#       `y_pred` are the model's predictions.
#       `y_true` should have shape
#       `(batch_size, d0, .. dN)` (except in the case of
#       sparse loss functions such as
#       sparse categorical crossentropy which expects integer arrays of shape
#       `(batch_size, d0, .. dN-1)`).
#       `y_pred` should have shape `(batch_size, d0, .. dN)`.
#       The loss function should return a float tensor.
#       If a custom `Loss` instance is
#       used and reduction is set to `None`, return value has shape
#       `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss
#       values; otherwise, it is a scalar. If the model has multiple outputs,
#       you can use a different loss on each output by passing a dictionary
#       or a list of losses. The loss value that will be minimized by the
#       model will then be the sum of all individual losses, unless
#       `loss_weights` is specified.
#     metrics: List of metrics to be evaluated by the model during training
#       and testing. Each of these can be a string (name of a built-in
#       function), function or a `tf.keras.metrics.Metric` instance. See
#       `tf.keras.metrics`. Typically you will use `metrics=['accuracy']`. A
#       function is any callable with the signature `result = fn(y_true,
#       y_pred)`. To specify different metrics for different outputs of a
#       multi-output model, you could also pass a dictionary, such as
#       `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`.
#       You can also pass a list to specify a metric or a list of metrics
#       for each output, such as `metrics=[['accuracy'], ['accuracy', 'mse']]`
#       or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the
#       strings 'accuracy' or 'acc', we convert this to one of
#       `tf.keras.metrics.BinaryAccuracy`,
#       `tf.keras.metrics.CategoricalAccuracy`,
#       `tf.keras.metrics.SparseCategoricalAccuracy` based on the loss
#       function used and the model output shape. We do a similar
#       conversion for the strings 'crossentropy' and 'ce' as well.
#     loss_weights: Optional list or dictionary specifying scalar coefficients
#       (Python floats) to weight the loss contributions of different model
#       outputs. The loss value that will be minimized by the model will then
#       be the *weighted sum* of all individual losses, weighted by the
#       `loss_weights` coefficients.
#         If a list, it is expected to have a 1:1 mapping to the model's
#           outputs. If a dict, it is expected to map output names (strings)
#           to scalar coefficients.
#     weighted_metrics: List of metrics to be evaluated and weighted by
#       `sample_weight` or `class_weight` during training and testing.
#     run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s
#       logic will not be wrapped in a `tf.function`. Recommended to leave
#       this as `None` unless your `Model` cannot be run inside a
#       `tf.function`. `run_eagerly=True` is not supported when using
#       `tf.distribute.experimental.ParameterServerStrategy`.
#     steps_per_execution: Int. Defaults to 1. The number of batches to run
#       during each `tf.function` call. Running multiple batches inside a
#       single `tf.function` call can greatly improve performance on TPUs or
#       small models with a large Python overhead. At most, one full epoch
#       will be run each execution. If a number larger than the size of the
#       epoch is passed, the execution will be truncated to the size of the
#       epoch. Note that if `steps_per_execution` is set to `N`,
#       `Callback.on_batch_begin` and `Callback.on_batch_end` methods will
#       only be called every `N` batches (i.e. before/after each `tf.function`
#       execution).
#     jit_compile: If `True`, compile the model training step with XLA.
#       [XLA](https://www.tensorflow.org/xla) is an optimizing compiler for
#       machine learning.
#       `jit_compile` is not enabled by default.
#       This option cannot be enabled with `run_eagerly=True`.
#       Note that `jit_compile=True`
#       may not necessarily work for all models.
#       For more information on supported operations please refer to the
#       [XLA documentation](https://www.tensorflow.org/xla).
#       Also refer to
#       [known XLA issues](https://www.tensorflow.org/xla/known_issues) for
#       more details.
#     **kwargs: Arguments supported for backwards compatibility only.
#
# </code>
# <a href='#45'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Configure the model for training: the 'adam' optimizer minimizes mean
# absolute error, while MSE and MAE are additionally reported as metrics.
model.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mse', 'mae'],
)



# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>46. Model Building and Training</h1>  <a id='46'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>keras</h2>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.fit</u></summary>
# <blockquote>
# <code>
# Trains the model for a fixed number of epochs (iterations on a dataset).
#
# Args:
#     x: Input data. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A dict mapping input names to the corresponding array/tensors,
#         if the model has named inputs.
#       - A `tf.data` dataset. Should return a tuple
#         of either `(inputs, targets)` or
#         `(inputs, targets, sample_weights)`.
#       - A generator or `keras.utils.Sequence` returning `(inputs, targets)`
#         or `(inputs, targets, sample_weights)`.
#       - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a
#         callable that takes a single argument of type
#         `tf.distribute.InputContext`, and returns a `tf.data.Dataset`.
#         `DatasetCreator` should be used when users prefer to specify the
#         per-replica batching and sharding logic for the `Dataset`.
#         See `tf.keras.utils.experimental.DatasetCreator` doc for more
#         information.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given below. If using
#       `tf.distribute.experimental.ParameterServerStrategy`, only
#       `DatasetCreator` type is supported for `x`.
#     y: Target data. Like the input data `x`,
#       it could be either Numpy array(s) or TensorFlow tensor(s).
#       It should be consistent with `x` (you cannot have Numpy inputs and
#       tensor targets, or inversely). If `x` is a dataset, generator,
#       or `keras.utils.Sequence` instance, `y` should
#       not be specified (since targets will be obtained from `x`).
#     batch_size: Integer or `None`.
#         Number of samples per gradient update.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of datasets, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     epochs: Integer. Number of epochs to train the model.
#         An epoch is an iteration over the entire `x` and `y`
#         data provided
#         (unless the `steps_per_epoch` flag is set to
#         something other than None).
#         Note that in conjunction with `initial_epoch`,
#         `epochs` is to be understood as "final epoch".
#         The model is not trained for a number of iterations
#         given by `epochs`, but merely until the epoch
#         of index `epochs` is reached.
#     verbose: 'auto', 0, 1, or 2. Verbosity mode.
#         0 = silent, 1 = progress bar, 2 = one line per epoch.
#         'auto' defaults to 1 for most cases, but 2 when used with
#         `ParameterServerStrategy`. Note that the progress bar is not
#         particularly useful when logged to a file, so verbose=2 is
#         recommended when not running interactively (eg, in a production
#         environment).
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during training.
#         See `tf.keras.callbacks`. Note `tf.keras.callbacks.ProgbarLogger`
#         and `tf.keras.callbacks.History` callbacks are created automatically
#         and need not be passed into `model.fit`.
#         `tf.keras.callbacks.ProgbarLogger` is created or not based on
#         `verbose` argument to `model.fit`.
#         Callbacks with batch-level calls are currently unsupported with
#         `tf.distribute.experimental.ParameterServerStrategy`, and users are
#         advised to implement epoch-level calls instead with an appropriate
#         `steps_per_epoch` value.
#     validation_split: Float between 0 and 1.
#         Fraction of the training data to be used as validation data.
#         The model will set apart this fraction of the training data,
#         will not train on it, and will evaluate
#         the loss and any model metrics
#         on this data at the end of each epoch.
#         The validation data is selected from the last samples
#         in the `x` and `y` data provided, before shuffling. This argument is
#         not supported when `x` is a dataset, generator or
#         `keras.utils.Sequence` instance.
#         `validation_split` is not yet supported with
#         `tf.distribute.experimental.ParameterServerStrategy`.
#     validation_data: Data on which to evaluate
#         the loss and any model metrics at the end of each epoch.
#         The model will not be trained on this data. Thus, note the fact
#         that the validation loss of data provided using `validation_split`
#         or `validation_data` is not affected by regularization layers like
#         noise and dropout.
#         `validation_data` will override `validation_split`.
#         `validation_data` could be:
#           - A tuple `(x_val, y_val)` of Numpy arrays or tensors.
#           - A tuple `(x_val, y_val, val_sample_weights)` of NumPy arrays.
#           - A `tf.data.Dataset`.
#           - A Python generator or `keras.utils.Sequence` returning
#           `(inputs, targets)` or `(inputs, targets, sample_weights)`.
#         `validation_data` is not yet supported with
#         `tf.distribute.experimental.ParameterServerStrategy`.
#     shuffle: Boolean (whether to shuffle the training data
#         before each epoch) or str (for 'batch'). This argument is ignored
#         when `x` is a generator or an object of tf.data.Dataset.
#         'batch' is a special option for dealing
#         with the limitations of HDF5 data; it shuffles in batch-sized
#         chunks. Has no effect when `steps_per_epoch` is not `None`.
#     class_weight: Optional dictionary mapping class indices (integers)
#         to a weight (float) value, used for weighting the loss function
#         (during training only).
#         This can be useful to tell the model to
#         "pay more attention" to samples from
#         an under-represented class.
#     sample_weight: Optional Numpy array of weights for
#         the training samples, used for weighting the loss function
#         (during training only). You can either pass a flat (1D)
#         Numpy array with the same length as the input samples
#         (1:1 mapping between weights and samples),
#         or in the case of temporal data,
#         you can pass a 2D array with shape
#         `(samples, sequence_length)`,
#         to apply a different weight to every timestep of every sample. This
#         argument is not supported when `x` is a dataset, generator, or
#         `keras.utils.Sequence` instance; instead, provide the sample_weights
#         as the third element of `x`.
#     initial_epoch: Integer.
#         Epoch at which to start training
#         (useful for resuming a previous training run).
#     steps_per_epoch: Integer or `None`.
#         Total number of steps (batches of samples)
#         before declaring one epoch finished and starting the
#         next epoch. When training with input tensors such as
#         TensorFlow data tensors, the default `None` is equal to
#         the number of samples in your dataset divided by
#         the batch size, or 1 if that cannot be determined. If x is a
#         `tf.data` dataset, and 'steps_per_epoch'
#         is None, the epoch will run until the input dataset is exhausted.
#         When passing an infinitely repeating dataset, you must specify the
#         `steps_per_epoch` argument. If `steps_per_epoch=-1` the training
#         will run indefinitely with an infinitely repeating dataset.
#         This argument is not supported with array inputs.
#         When using `tf.distribute.experimental.ParameterServerStrategy`:
#           * `steps_per_epoch=None` is not supported.
#     validation_steps: Only relevant if `validation_data` is provided and
#         is a `tf.data` dataset. Total number of steps (batches of
#         samples) to draw before stopping when performing validation
#         at the end of every epoch. If 'validation_steps' is None, validation
#         will run until the `validation_data` dataset is exhausted. In the
#         case of an infinitely repeated dataset, it will run into an
#         infinite loop. If 'validation_steps' is specified and only part of
#         the dataset will be consumed, the evaluation will start from the
#         beginning of the dataset at each epoch. This ensures that the same
#         validation samples are used every time.
#     validation_batch_size: Integer or `None`.
#         Number of samples per validation batch.
#         If unspecified, will default to `batch_size`.
#         Do not specify the `validation_batch_size` if your data is in the
#         form of datasets, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     validation_freq: Only relevant if validation data is provided. Integer
#         or `collections.abc.Container` instance (e.g. list, tuple, etc.).
#         If an integer, specifies how many training epochs to run before a
#         new validation run is performed, e.g. `validation_freq=2` runs
#         validation every 2 epochs. If a Container, specifies the epochs on
#         which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
#         validation at the end of the 1st, 2nd, and 10th epochs.
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up
#         when using process-based threading. If unspecified, `workers`
#         will default to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# Unpacking behavior for iterator-like inputs:
#     A common pattern is to pass a tf.data.Dataset, generator, or
#   tf.keras.utils.Sequence to the `x` argument of fit, which will in fact
#   yield not only features (x) but optionally targets (y) and sample weights.
#   Keras requires that the output of such iterator-likes be unambiguous. The
#   iterator should return a tuple of length 1, 2, or 3, where the optional
#   second and third elements will be used for y and sample_weight
#   respectively. Any other type provided will be wrapped in a length one
#   tuple, effectively treating everything as 'x'. When yielding dicts, they
#   should still adhere to the top-level tuple structure.
#   e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate
#   features, targets, and weights from the keys of a single dict.
#     A notable unsupported data type is the namedtuple. The reason is that
#   it behaves like both an ordered datatype (tuple) and a mapping
#   datatype (dict). So given a namedtuple of the form:
#       `namedtuple("example_tuple", ["y", "x"])`
#   it is ambiguous whether to reverse the order of the elements when
#   interpreting the value. Even worse is a tuple of the form:
#       `namedtuple("other_tuple", ["x", "y", "z"])`
#   where it is unclear if the tuple was intended to be unpacked into x, y,
#   and sample_weight or passed through as a single element to `x`. As a
#   result the data processing code will simply raise a ValueError if it
#   encounters a namedtuple. (Along with instructions to remedy the issue.)
#
# Returns:
#     A `History` object. Its `History.history` attribute is
#     a record of training loss values and metrics values
#     at successive epochs, as well as validation loss values
#     and validation metrics values (if applicable).
#
# Raises:
#     RuntimeError: 1. If the model was never compiled or,
#     2. If `model.fit` is wrapped in `tf.function`.
#
#     ValueError: In case of mismatch between the provided input data
#         and what the model expects or when the input data is empty.
#
# </code>
# <a href='#46'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Train for 32 epochs. `validation_split=0.2` sets aside the last 20% of the
# provided training samples (taken before shuffling) for per-epoch validation.
# The returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    epochs=32,
    validation_split=0.2,
)



# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>47. Data Preparation | Library Loading | Model Building and Training</h1>  <a id='47'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>keras</h2>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.predict</u></summary>
# <blockquote>
# <code>
# Generates output predictions for the input samples.
#
# Computation is done in batches. This method is designed for batch processing
# of large numbers of inputs. It is not intended for use inside of loops
# that iterate over your data and process small numbers of inputs at a time.
#
# For small numbers of inputs that fit in one batch,
# directly use `__call__()` for faster execution, e.g.,
# `model(x)`, or `model(x, training=False)` if you have layers such as
# `tf.keras.layers.BatchNormalization` that behave differently during
# inference. You may pair the individual model call with a `tf.function`
# for additional performance inside your inner loop.
# If you need access to numpy array values instead of tensors after your
# model call, you can use `tensor.numpy()` to get the numpy array value of
# an eager tensor.
#
# Also, note the fact that test loss is not affected by
# regularization layers like noise and dropout.
#
# Note: See [this FAQ entry](
# https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call)
# for more details about the difference between `Model` methods `predict()`
# and `__call__()`.
#
# Args:
#     x: Input samples. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A `tf.data` dataset.
#       - A generator or `keras.utils.Sequence` instance.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given in the `Unpacking behavior
#       for iterator-like inputs` section of `Model.fit`.
#     batch_size: Integer or `None`.
#         Number of samples per batch.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of dataset, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     verbose: Verbosity mode, 0 or 1.
#     steps: Total number of steps (batches of samples)
#         before declaring the prediction round finished.
#         Ignored with the default value of `None`. If x is a `tf.data`
#         dataset and `steps` is None, `predict()` will
#         run until the input dataset is exhausted.
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during prediction.
#         See [callbacks](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks).
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up when using
#         process-based threading. If unspecified, `workers` will default
#         to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# See the discussion of `Unpacking behavior for iterator-like inputs` for
# `Model.fit`. Note that Model.predict uses the same interpretation rules as
# `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for all
# three methods.
#
# Returns:
#     Numpy array(s) of predictions.
#
# Raises:
#     RuntimeError: If `model.predict` is wrapped in a `tf.function`.
#     ValueError: In case of mismatch between the provided
#         input data and the model's expectations,
#         or in case a stateful model receives a number of samples
#         that is not a multiple of the batch size.
#
# </code>
# <a href='#47'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <h2 class='hglib'>sklearn</h2>
# <ul>
# <li>
# <details><summary><u>sklearn.metrics._regression.mean_squared_error</u></summary>
# <blockquote>
# <code>
# Mean squared error regression loss.
#
# Read more in the :ref:`User Guide <mean_squared_error>`.
#
# Parameters
# ----------
# y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
#     Ground truth (correct) target values.
#
# y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
#     Estimated target values.
#
# sample_weight : array-like of shape (n_samples,), default=None
#     Sample weights.
#
# multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'
#     Defines aggregating of multiple output values.
#     Array-like value defines weights used to average errors.
#
#     'raw_values' :
#         Returns a full set of errors in case of multioutput input.
#
#     'uniform_average' :
#         Errors of all outputs are averaged with uniform weight.
#
# squared : bool, default=True
#     If True returns MSE value, if False returns RMSE value.
#
# Returns
# -------
# loss : float or ndarray of floats
#     A non-negative floating point value (the best value is 0.0), or an
#     array of floating point values, one for each individual target.
#
# Examples
# --------
# >>> from sklearn.metrics import mean_squared_error
# >>> y_true = [3, -0.5, 2, 7]
# >>> y_pred = [2.5, 0.0, 2, 8]
# >>> mean_squared_error(y_true, y_pred)
# 0.375
# >>> y_true = [3, -0.5, 2, 7]
# >>> y_pred = [2.5, 0.0, 2, 8]
# >>> mean_squared_error(y_true, y_pred, squared=False)
# 0.612...
# >>> y_true = [[0.5, 1],[-1, 1],[7, -6]]
# >>> y_pred = [[0, 2],[-1, 2],[8, -5]]
# >>> mean_squared_error(y_true, y_pred)
# 0.708...
# >>> mean_squared_error(y_true, y_pred, squared=False)
# 0.822...
# >>> mean_squared_error(y_true, y_pred, multioutput='raw_values')
# array([0.41666667, 1.        ])
# >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
# 0.825...
#
# </code>
# <a href='#47'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
from sklearn.metrics import mean_squared_error

# Predict on the training inputs, then print the square root of the mean
# squared error between the training targets and those predictions.
pred_train = model.predict(x_train)
print(
    np.sqrt(
        mean_squared_error(y_true=y_train, y_pred=pred_train)
    )
)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>48. Data Preparation | Model Building and Training</h1>  <a id='48'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>keras</h2>
# <ul>
# <li>
# <details><summary><u>keras.engine.training.Model.predict</u></summary>
# <blockquote>
# <code>
# Generates output predictions for the input samples.
#
# Computation is done in batches. This method is designed for batch processing
# of large numbers of inputs. It is not intended for use inside of loops
# that iterate over your data and process small numbers of inputs at a time.
#
# For small numbers of inputs that fit in one batch,
# directly use `__call__()` for faster execution, e.g.,
# `model(x)`, or `model(x, training=False)` if you have layers such as
# `tf.keras.layers.BatchNormalization` that behave differently during
# inference. You may pair the individual model call with a `tf.function`
# for additional performance inside your inner loop.
# If you need access to numpy array values instead of tensors after your
# model call, you can use `tensor.numpy()` to get the numpy array value of
# an eager tensor.
#
# Also, note the fact that test loss is not affected by
# regularization layers like noise and dropout.
#
# Note: See [this FAQ entry](
# https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call)
# for more details about the difference between `Model` methods `predict()`
# and `__call__()`.
#
# Args:
#     x: Input samples. It could be:
#       - A Numpy array (or array-like), or a list of arrays
#         (in case the model has multiple inputs).
#       - A TensorFlow tensor, or a list of tensors
#         (in case the model has multiple inputs).
#       - A `tf.data` dataset.
#       - A generator or `keras.utils.Sequence` instance.
#       A more detailed description of unpacking behavior for iterator types
#       (Dataset, generator, Sequence) is given in the `Unpacking behavior
#       for iterator-like inputs` section of `Model.fit`.
#     batch_size: Integer or `None`.
#         Number of samples per batch.
#         If unspecified, `batch_size` will default to 32.
#         Do not specify the `batch_size` if your data is in the
#         form of dataset, generators, or `keras.utils.Sequence` instances
#         (since they generate batches).
#     verbose: Verbosity mode, 0 or 1.
#     steps: Total number of steps (batches of samples)
#         before declaring the prediction round finished.
#         Ignored with the default value of `None`. If x is a `tf.data`
#         dataset and `steps` is None, `predict()` will
#         run until the input dataset is exhausted.
#     callbacks: List of `keras.callbacks.Callback` instances.
#         List of callbacks to apply during prediction.
#         See [callbacks](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks).
#     max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
#         input only. Maximum size for the generator queue.
#         If unspecified, `max_queue_size` will default to 10.
#     workers: Integer. Used for generator or `keras.utils.Sequence` input
#         only. Maximum number of processes to spin up when using
#         process-based threading. If unspecified, `workers` will default
#         to 1.
#     use_multiprocessing: Boolean. Used for generator or
#         `keras.utils.Sequence` input only. If `True`, use process-based
#         threading. If unspecified, `use_multiprocessing` will default to
#         `False`. Note that because this implementation relies on
#         multiprocessing, you should not pass non-picklable arguments to
#         the generator as they can't be passed easily to children processes.
#
# See the discussion of `Unpacking behavior for iterator-like inputs` for
# `Model.fit`. Note that Model.predict uses the same interpretation rules as
# `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for all
# three methods.
#
# Returns:
#     Numpy array(s) of predictions.
#
# Raises:
#     RuntimeError: If `model.predict` is wrapped in a `tf.function`.
#     ValueError: In case of mismatch between the provided
#         input data and the model's expectations,
#         or in case a stateful model receives a number of samples
#         that is not a multiple of the batch size.
#
# </code>
# <a href='#48'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
# <li> <h2 class='hglib'>numpy</h2>
# <ul>
# <li>
# <details><summary><u>numpy.ndarray.flatten</u></summary>
# <blockquote>
# <code>
# a.flatten(order='C')
#
# Return a copy of the array collapsed into one dimension.
#
# Parameters
# ----------
# order : {'C', 'F', 'A', 'K'}, optional
#     'C' means to flatten in row-major (C-style) order.
#     'F' means to flatten in column-major (Fortran-
#     style) order. 'A' means to flatten in column-major
#     order if `a` is Fortran *contiguous* in memory,
#     row-major order otherwise. 'K' means to flatten
#     `a` in the order the elements occur in memory.
#     The default is 'C'.
#
# Returns
# -------
# y : ndarray
#     A copy of the input array, flattened to one dimension.
#
# See Also
# --------
# ravel : Return a flattened array.
# flat : A 1-D flat iterator over the array.
#
# Examples
# --------
# >>> a = np.array([[1,2], [3,4]])
# >>> a.flatten()
# array([1, 2, 3, 4])
# >>> a.flatten('F')
# array([1, 3, 2, 4])
#
# </code>
# <a href='#48'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Predict on the test inputs and collapse the output into a one-dimensional
# copy so it can be used as a flat vector of predictions.
y_pred = (
    model
    .predict(x_test)
    .flatten()
)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>49. Data Preparation</h1>  <a id='49'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame</u></summary>
# <blockquote>
# <code>
# Two-dimensional, size-mutable, potentially heterogeneous tabular data.
#
# Data structure also contains labeled axes (rows and columns).
# Arithmetic operations align on both row and column labels. Can be
# thought of as a dict-like container for Series objects. The primary
# pandas data structure.
#
# Parameters
# ----------
# data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
#     Dict can contain Series, arrays, constants, dataclass or list-like objects. If
#     data is a dict, column order follows insertion-order. If a dict contains Series
#     which have an index defined, it is aligned by its index.
#
#     .. versionchanged:: 0.25.0
#        If data is a list of dicts, column order follows insertion-order.
#
# index : Index or array-like
#     Index to use for resulting frame. Will default to RangeIndex if
#     no indexing information part of input data and no index provided.
# columns : Index or array-like
#     Column labels to use for resulting frame when data does not have them,
#     defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
#     will perform column selection instead.
# dtype : dtype, default None
#     Data type to force. Only a single dtype is allowed. If None, infer.
# copy : bool or None, default None
#     Copy data from inputs.
#     For dict data, the default of None behaves like ``copy=True``.  For DataFrame
#     or 2d ndarray input, the default of None behaves like ``copy=False``.
#
#     .. versionchanged:: 1.3.0
#
# See Also
# --------
# DataFrame.from_records : Constructor from tuples, also record arrays.
# DataFrame.from_dict : From dicts of Series, arrays, or dicts.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_table : Read general delimited file into DataFrame.
# read_clipboard : Read text from clipboard into DataFrame.
#
# Examples
# --------
# Constructing DataFrame from a dictionary.
#
# >>> d = {'col1': [1, 2], 'col2': [3, 4]}
# >>> df = pd.DataFrame(data=d)
# >>> df
#    col1  col2
# 0     1     3
# 1     2     4
#
# Notice that the inferred dtype is int64.
#
# >>> df.dtypes
# col1    int64
# col2    int64
# dtype: object
#
# To enforce a single dtype:
#
# >>> df = pd.DataFrame(data=d, dtype=np.int8)
# >>> df.dtypes
# col1    int8
# col2    int8
# dtype: object
#
# Constructing DataFrame from a dictionary including Series:
#
# >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
# >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
#    col1  col2
# 0     0   NaN
# 1     1   NaN
# 2     2   2.0
# 3     3   3.0
#
# Constructing DataFrame from numpy ndarray:
#
# >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
# ...                    columns=['a', 'b', 'c'])
# >>> df2
#    a  b  c
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9
#
# Constructing DataFrame from a numpy ndarray that has labeled columns:
#
# >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
# ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
# >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
# ...
# >>> df3
#    c  a
# 0  3  1
# 1  6  4
# 2  9  7
#
# Constructing DataFrame from dataclass:
#
# >>> from dataclasses import make_dataclass
# >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
# >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
#    x  y
# 0  0  0
# 1  0  3
# 2  2  3
#
# </code>
# <a href='#49'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.to_csv</u></summary>
# <blockquote>
# <code>
# Write object to a comma-separated values (csv) file.
#
# Parameters
# ----------
# path_or_buf : str, path object, file-like object, or None, default None
#     String, path object (implementing os.PathLike[str]), or file-like
#     object implementing a write() function. If None, the result is
#     returned as a string. If a non-binary file object is passed, it should
#     be opened with `newline=''`, disabling universal newlines. If a binary
#     file object is passed, `mode` might need to contain a `'b'`.
#
#     .. versionchanged:: 1.2.0
#
#        Support for binary file objects was introduced.
#
# sep : str, default ','
#     String of length 1. Field delimiter for the output file.
# na_rep : str, default ''
#     Missing data representation.
# float_format : str, default None
#     Format string for floating point numbers.
# columns : sequence, optional
#     Columns to write.
# header : bool or list of str, default True
#     Write out the column names. If a list of strings is given it is
#     assumed to be aliases for the column names.
# index : bool, default True
#     Write row names (index).
# index_label : str or sequence, or False, default None
#     Column label for index column(s) if desired. If None is given, and
#     `header` and `index` are True, then the index names are used. A
#     sequence should be given if the object uses MultiIndex. If
#     False do not print fields for index names. Use index_label=False
#     for easier importing in R.
# mode : str
#     Python write mode, default 'w'.
# encoding : str, optional
#     A string representing the encoding to use in the output file,
#     defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
#     is a non-binary file object.
# compression : str or dict, default 'infer'
#     For on-the-fly compression of the output data. If 'infer' and `path_or_buf`
#     is path-like, then detect compression from the following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). Set to
#     ``None`` for no compression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdCompressor``, respectively. As an
#     example, the following could be passed for faster compression and to create
#     a reproducible gzip archive:
#     ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
#
#     .. versionchanged:: 1.0.0
#
#        May now be a dict with key 'method' as compression mode
#        and other entries as additional compression options if
#        compression mode is 'zip'.
#
#     .. versionchanged:: 1.1.0
#
#        Passing compression options as keys in dict is
#        supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
#
#     .. versionchanged:: 1.2.0
#
#         Compression is supported for binary file objects.
#
#     .. versionchanged:: 1.2.0
#
#         Previous versions forwarded dict entries for 'gzip' to
#         `gzip.open` instead of `gzip.GzipFile` which prevented
#         setting `mtime`.
#
# quoting : optional constant from csv module
#     Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
#     then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
#     will treat them as non-numeric.
# quotechar : str, default '\"'
#     String of length 1. Character used to quote fields.
# line_terminator : str, optional
#     The newline character or character sequence to use in the output
#     file. Defaults to `os.linesep`, which depends on the OS in which
#     this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
# chunksize : int or None
#     Rows to write at a time.
# date_format : str, default None
#     Format string for datetime objects.
# doublequote : bool, default True
#     Control quoting of `quotechar` inside a field.
# escapechar : str, default None
#     String of length 1. Character used to escape `sep` and `quotechar`
#     when appropriate.
# decimal : str, default '.'
#     Character recognized as decimal separator. E.g. use ',' for
#     European data.
# errors : str, default 'strict'
#     Specifies how encoding and decoding errors are to be handled.
#     See the errors argument for :func:`open` for a full list
#     of options.
#
#     .. versionadded:: 1.1.0
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2.0
#
# Returns
# -------
# None or str
#     If path_or_buf is None, returns the resulting csv format as a
#     string. Otherwise returns None.
#
# See Also
# --------
# read_csv : Load a CSV file into a DataFrame.
# to_excel : Write DataFrame to an Excel file.
#
# Examples
# --------
# >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
# ...                    'mask': ['red', 'purple'],
# ...                    'weapon': ['sai', 'bo staff']})
# >>> df.to_csv(index=False)
# 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
#
# Create 'out.zip' containing 'out.csv'
#
# >>> compression_opts = dict(method='zip',
# ...                         archive_name='out.csv')  # doctest: +SKIP
# >>> df.to_csv('out.zip', index=False,
# ...           compression=compression_opts)  # doctest: +SKIP
#
# To write a csv file to a new folder or nested folder you will first
# need to create it using either Pathlib or os:
#
# >>> from pathlib import Path  # doctest: +SKIP
# >>> filepath = Path('folder/subfolder/out.csv')  # doctest: +SKIP
# >>> filepath.parent.mkdir(parents=True, exist_ok=True)  # doctest: +SKIP
# >>> df.to_csv(filepath)  # doctest: +SKIP
#
# >>> import os  # doctest: +SKIP
# >>> os.makedirs('folder/subfolder', exist_ok=True)  # doctest: +SKIP
# >>> df.to_csv('folder/subfolder/out.csv')  # doctest: +SKIP
#
# </code>
# <a href='#49'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Pair each ID from `submission` with its entry in `y_pred`, then write the
# two-column table to 'submission1.csv' without the row index.
output = pd.DataFrame(
    data={
        'ID': submission['ID'],
        'item_cnt_month': y_pred,
    }
)
output.to_csv(path_or_buf='submission1.csv', index=False)


# %% [markdown] deletable=false editable=false run_control={"frozen": true}
# <h1 class='hg'>50. Data Preparation | Feature Engineering</h1>  <a id='50'></a><small><a href='#top_phases'>back to top</a></small><details><summary><u>View function documentation</u></summary>
# <ul>
#
# <li> <h2 class='hglib'>pandas</h2>
# <ul>
# <li>
# <details><summary><u>pandas.core.frame.DataFrame</u></summary>
# <blockquote>
# <code>
# Two-dimensional, size-mutable, potentially heterogeneous tabular data.
#
# Data structure also contains labeled axes (rows and columns).
# Arithmetic operations align on both row and column labels. Can be
# thought of as a dict-like container for Series objects. The primary
# pandas data structure.
#
# Parameters
# ----------
# data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
#     Dict can contain Series, arrays, constants, dataclass or list-like objects. If
#     data is a dict, column order follows insertion-order. If a dict contains Series
#     which have an index defined, it is aligned by its index.
#
#     .. versionchanged:: 0.25.0
#        If data is a list of dicts, column order follows insertion-order.
#
# index : Index or array-like
#     Index to use for resulting frame. Will default to RangeIndex if no
#     indexing information is part of the input data and no index is provided.
# columns : Index or array-like
#     Column labels to use for resulting frame when data does not have them,
#     defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
#     will perform column selection instead.
# dtype : dtype, default None
#     Data type to force. Only a single dtype is allowed. If None, infer.
# copy : bool or None, default None
#     Copy data from inputs.
#     For dict data, the default of None behaves like ``copy=True``.  For DataFrame
#     or 2d ndarray input, the default of None behaves like ``copy=False``.
#
#     .. versionchanged:: 1.3.0
#
# See Also
# --------
# DataFrame.from_records : Constructor from tuples, also record arrays.
# DataFrame.from_dict : From dicts of Series, arrays, or dicts.
# read_csv : Read a comma-separated values (csv) file into DataFrame.
# read_table : Read general delimited file into DataFrame.
# read_clipboard : Read text from clipboard into DataFrame.
#
# Examples
# --------
# Constructing DataFrame from a dictionary.
#
# >>> d = {'col1': [1, 2], 'col2': [3, 4]}
# >>> df = pd.DataFrame(data=d)
# >>> df
#    col1  col2
# 0     1     3
# 1     2     4
#
# Notice that the inferred dtype is int64.
#
# >>> df.dtypes
# col1    int64
# col2    int64
# dtype: object
#
# To enforce a single dtype:
#
# >>> df = pd.DataFrame(data=d, dtype=np.int8)
# >>> df.dtypes
# col1    int8
# col2    int8
# dtype: object
#
# Constructing DataFrame from a dictionary including Series:
#
# >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
# >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
#    col1  col2
# 0     0   NaN
# 1     1   NaN
# 2     2   2.0
# 3     3   3.0
#
# Constructing DataFrame from numpy ndarray:
#
# >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
# ...                    columns=['a', 'b', 'c'])
# >>> df2
#    a  b  c
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9
#
# Constructing DataFrame from a numpy ndarray that has labeled columns:
#
# >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
# ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
# >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
# ...
# >>> df3
#    c  a
# 0  3  1
# 1  6  4
# 2  9  7
#
# Constructing DataFrame from dataclass:
#
# >>> from dataclasses import make_dataclass
# >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
# >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
#    x  y
# 0  0  0
# 1  0  3
# 2  2  3
#
# </code>
# <a href='#50'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.reshape.concat.concat</u></summary>
# <blockquote>
# <code>
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes.
#
# Can also add a layer of hierarchical indexing on the concatenation axis,
# which may be useful if the labels are the same (or overlapping) on
# the passed axis number.
#
# Parameters
# ----------
# objs : a sequence or mapping of Series or DataFrame objects
#     If a mapping is passed, the sorted keys will be used as the `keys`
#     argument, unless it is passed, in which case the values will be
#     selected (see below). Any None objects will be dropped silently unless
#     they are all None in which case a ValueError will be raised.
# axis : {0/'index', 1/'columns'}, default 0
#     The axis to concatenate along.
# join : {'inner', 'outer'}, default 'outer'
#     How to handle indexes on other axis (or axes).
# ignore_index : bool, default False
#     If True, do not use the index values along the concatenation axis. The
#     resulting axis will be labeled 0, ..., n - 1. This is useful if you are
#     concatenating objects where the concatenation axis does not have
#     meaningful indexing information. Note the index values on the other
#     axes are still respected in the join.
# keys : sequence, default None
#     If multiple levels passed, should contain tuples. Construct
#     hierarchical index using the passed keys as the outermost level.
# levels : list of sequences, default None
#     Specific levels (unique values) to use for constructing a
#     MultiIndex. Otherwise they will be inferred from the keys.
# names : list, default None
#     Names for the levels in the resulting hierarchical index.
# verify_integrity : bool, default False
#     Check whether the new concatenated axis contains duplicates. This can
#     be very expensive relative to the actual data concatenation.
# sort : bool, default False
#     Sort non-concatenation axis if it is not already aligned when `join`
#     is 'outer'.
#     This has no effect when ``join='inner'``, which already preserves
#     the order of the non-concatenation axis.
#
#     .. versionchanged:: 1.0.0
#
#        Changed to not sort by default.
#
# copy : bool, default True
#     If False, do not copy data unnecessarily.
#
# Returns
# -------
# object, type of objs
#     When concatenating all ``Series`` along the index (axis=0), a
#     ``Series`` is returned. When ``objs`` contains at least one
#     ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
#     the columns (axis=1), a ``DataFrame`` is returned.
#
# See Also
# --------
# Series.append : Concatenate Series.
# DataFrame.append : Concatenate DataFrames.
# DataFrame.join : Join DataFrames using indexes.
# DataFrame.merge : Merge DataFrames by indexes or columns.
#
# Notes
# -----
# The keys, levels, and names arguments are all optional.
#
# A walkthrough of how this method fits in with other tools for combining
# pandas objects can be found `here
# <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
#
# Examples
# --------
# Combine two ``Series``.
#
# >>> s1 = pd.Series(['a', 'b'])
# >>> s2 = pd.Series(['c', 'd'])
# >>> pd.concat([s1, s2])
# 0    a
# 1    b
# 0    c
# 1    d
# dtype: object
#
# Clear the existing index and reset it in the result
# by setting the ``ignore_index`` option to ``True``.
#
# >>> pd.concat([s1, s2], ignore_index=True)
# 0    a
# 1    b
# 2    c
# 3    d
# dtype: object
#
# Add a hierarchical index at the outermost level of
# the data with the ``keys`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'])
# s1  0    a
#     1    b
# s2  0    c
#     1    d
# dtype: object
#
# Label the index keys you create with the ``names`` option.
#
# >>> pd.concat([s1, s2], keys=['s1', 's2'],
# ...           names=['Series name', 'Row ID'])
# Series name  Row ID
# s1           0         a
#              1         b
# s2           0         c
#              1         d
# dtype: object
#
# Combine two ``DataFrame`` objects with identical columns.
#
# >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
# ...                    columns=['letter', 'number'])
# >>> df1
#   letter  number
# 0      a       1
# 1      b       2
# >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
# ...                    columns=['letter', 'number'])
# >>> df2
#   letter  number
# 0      c       3
# 1      d       4
# >>> pd.concat([df1, df2])
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects with overlapping columns
# and return everything. Columns outside the intersection will
# be filled with ``NaN`` values.
#
# >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
# ...                    columns=['letter', 'number', 'animal'])
# >>> df3
#   letter  number animal
# 0      c       3    cat
# 1      d       4    dog
# >>> pd.concat([df1, df3], sort=False)
#   letter  number animal
# 0      a       1    NaN
# 1      b       2    NaN
# 0      c       3    cat
# 1      d       4    dog
#
# Combine ``DataFrame`` objects with overlapping columns
# and return only those that are shared by passing ``inner`` to
# the ``join`` keyword argument.
#
# >>> pd.concat([df1, df3], join="inner")
#   letter  number
# 0      a       1
# 1      b       2
# 0      c       3
# 1      d       4
#
# Combine ``DataFrame`` objects horizontally along the x axis by
# passing in ``axis=1``.
#
# >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
# ...                    columns=['animal', 'name'])
# >>> pd.concat([df1, df4], axis=1)
#   letter  number  animal    name
# 0      a       1    bird   polly
# 1      b       2  monkey  george
#
# Prevent the result from including duplicate index values with the
# ``verify_integrity`` option.
#
# >>> df5 = pd.DataFrame([1], index=['a'])
# >>> df5
#    0
# a  1
# >>> df6 = pd.DataFrame([2], index=['a'])
# >>> df6
#    0
# a  2
# >>> pd.concat([df5, df6], verify_integrity=True)
# Traceback (most recent call last):
#     ...
# ValueError: Indexes have overlapping values: ['a']
#
# </code>
# <a href='#50'>back to header</a>
# </blockquote>
# </details>
# </li>
# <li>
# <details><summary><u>pandas.core.generic.NDFrame.to_csv</u></summary>
# <blockquote>
# <code>
# Write object to a comma-separated values (csv) file.
#
# Parameters
# ----------
# path_or_buf : str, path object, file-like object, or None, default None
#     String, path object (implementing os.PathLike[str]), or file-like
#     object implementing a write() function. If None, the result is
#     returned as a string. If a non-binary file object is passed, it should
#     be opened with `newline=''`, disabling universal newlines. If a binary
#     file object is passed, `mode` might need to contain a `'b'`.
#
#     .. versionchanged:: 1.2.0
#
#        Support for binary file objects was introduced.
#
# sep : str, default ','
#     String of length 1. Field delimiter for the output file.
# na_rep : str, default ''
#     Missing data representation.
# float_format : str, default None
#     Format string for floating point numbers.
# columns : sequence, optional
#     Columns to write.
# header : bool or list of str, default True
#     Write out the column names. If a list of strings is given it is
#     assumed to be aliases for the column names.
# index : bool, default True
#     Write row names (index).
# index_label : str or sequence, or False, default None
#     Column label for index column(s) if desired. If None is given, and
#     `header` and `index` are True, then the index names are used. A
#     sequence should be given if the object uses MultiIndex. If
#     False do not print fields for index names. Use index_label=False
#     for easier importing in R.
# mode : str
#     Python write mode, default 'w'.
# encoding : str, optional
#     A string representing the encoding to use in the output file,
#     defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
#     is a non-binary file object.
# compression : str or dict, default 'infer'
#     For on-the-fly compression of the output data. If 'infer' and `path_or_buf`
#     is path-like, then detect compression from the following extensions: '.gz',
#     '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). Set to
#     ``None`` for no compression. Can also be a dict with key ``'method'`` set
#     to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
#     key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
#     ``bz2.BZ2File``, or ``zstandard.ZstdCompressor``, respectively. As an
#     example, the following could be passed for faster compression and to create
#     a reproducible gzip archive:
#     ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
#
#     .. versionchanged:: 1.0.0
#
#        May now be a dict with key 'method' as compression mode
#        and other entries as additional compression options if
#        compression mode is 'zip'.
#
#     .. versionchanged:: 1.1.0
#
#        Passing compression options as keys in dict is
#        supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
#
#     .. versionchanged:: 1.2.0
#
#         Compression is supported for binary file objects.
#
#     .. versionchanged:: 1.2.0
#
#         Previous versions forwarded dict entries for 'gzip' to
#         `gzip.open` instead of `gzip.GzipFile` which prevented
#         setting `mtime`.
#
# quoting : optional constant from csv module
#     Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
#     then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
#     will treat them as non-numeric.
# quotechar : str, default '\"'
#     String of length 1. Character used to quote fields.
# line_terminator : str, optional
#     The newline character or character sequence to use in the output
#     file. Defaults to `os.linesep`, which depends on the OS in which
#     this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
# chunksize : int or None
#     Rows to write at a time.
# date_format : str, default None
#     Format string for datetime objects.
# doublequote : bool, default True
#     Control quoting of `quotechar` inside a field.
# escapechar : str, default None
#     String of length 1. Character used to escape `sep` and `quotechar`
#     when appropriate.
# decimal : str, default '.'
#     Character recognized as decimal separator. E.g. use ',' for
#     European data.
# errors : str, default 'strict'
#     Specifies how encoding and decoding errors are to be handled.
#     See the errors argument for :func:`open` for a full list
#     of options.
#
#     .. versionadded:: 1.1.0
#
# storage_options : dict, optional
#     Extra options that make sense for a particular storage connection, e.g.
#     host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
#     are forwarded to ``urllib`` as header options. For other URLs (e.g.
#     starting with "s3://", and "gcs://") the key-value pairs are forwarded to
#     ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
#
#     .. versionadded:: 1.2.0
#
# Returns
# -------
# None or str
#     If path_or_buf is None, returns the resulting csv format as a
#     string. Otherwise returns None.
#
# See Also
# --------
# read_csv : Load a CSV file into a DataFrame.
# to_excel : Write DataFrame to an Excel file.
#
# Examples
# --------
# >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
# ...                    'mask': ['red', 'purple'],
# ...                    'weapon': ['sai', 'bo staff']})
# >>> df.to_csv(index=False)
# 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
#
# Create 'out.zip' containing 'out.csv'
#
# >>> compression_opts = dict(method='zip',
# ...                         archive_name='out.csv')  # doctest: +SKIP
# >>> df.to_csv('out.zip', index=False,
# ...           compression=compression_opts)  # doctest: +SKIP
#
# To write a csv file to a new folder or nested folder you will first
# need to create it using either Pathlib or os:
#
# >>> from pathlib import Path  # doctest: +SKIP
# >>> filepath = Path('folder/subfolder/out.csv')  # doctest: +SKIP
# >>> filepath.parent.mkdir(parents=True, exist_ok=True)  # doctest: +SKIP
# >>> df.to_csv(filepath)  # doctest: +SKIP
#
# >>> import os  # doctest: +SKIP
# >>> os.makedirs('folder/subfolder', exist_ok=True)  # doctest: +SKIP
# >>> df.to_csv('folder/subfolder/out.csv')  # doctest: +SKIP
#
# </code>
# <a href='#50'>back to header</a>
# </blockquote>
# </details>
# </li>
# </ul>
# </li>
#
# </ul>
# </details>

# %%
# Build a second submission file from the same IDs and predictions.
# Wrap the predictions in a DataFrame so they can be joined column-wise.
pred = pd.DataFrame(y_pred)
# axis=1 concat aligns rows on the index.
# NOTE(review): assumes `submission` has a default 0..n-1 index and `y_pred`
# yields exactly one column with one row per ID — confirm against the caller.
datasets = pd.concat([submission['ID'], pred], axis=1)
# Use the same prediction header as the 'submission1.csv' cell, which writes
# the identical `y_pred` values under 'item_cnt_month'.
datasets.columns = ['ID', 'item_cnt_month']
datasets.to_csv('new_submission.csv', index=False)

# %%
