# Source code for groclient.experimental

import pandas as pd

from typing import Dict, List

from groclient.client import GroClient
from groclient import lib
from groclient.constants import V2_DATA_DESCRIPTION_PREFIX, V2_DATA_DESCRIPTION_ATTRS

class Experimental(GroClient):
    """Client exposing experimental functions with a better user experience.

    While you will be able to access better performance and new features at an
    early stage, you should be aware that things might change (e.g. response
    format).
    """

    def get_data_points(self, **selections: Dict) -> List[Dict]:
        """This function is a mirror of existing :meth:`~groclient.GroClient.get_data_points`, but with limited scope.

        For example:

        - "Gro derived on-the-fly" is under development.

        - Many sources are still under migration (please refer to internal confluence page for source migration timeline)

        Parameters
        ----------
        metric_id : integer
            How something is measured. e.g. "Export Value" or "Area Harvested"
        item_ids : integer or list of integers
            What is being measured. e.g. "Corn" or "Rainfall"
        region_ids : integer or list of integers
            Where something is being measured e.g. "United States Corn Belt" or "China"
        partner_region_ids : integer or list of integers, optional
            partner_region refers to an interaction between two regions, like trade or
            transportation. For example, for an Export metric, the "region" would be the exporter
            and the "partner_region" would be the importer. For most series, this can be excluded
            or set to 0 ("World") by default.
        source_id : integer
        frequency_id : integer
        unit_id : integer, optional
        start_date : string, optional
            All data points with end dates after this date.
        end_date : string, optional
            All data points with start dates before this date.
        coverage_threshold : float, optional
            Custom threshold on the coverage of geospatial data. Value should be between 0 and 1.

        Returns
        -------
        list of dict
            One dictionary per data series, each containing a list of
            ``data_points`` and a ``series_description``. The
            ``start_timestamp`` and ``end_timestamp`` of every data point are
            converted to ``int`` before the list is returned.

            Example::

                from groclient.experimental import Experimental

                exp_client = Experimental(access_token="your_token_here")
                exp_client.get_data_points(
                    **{
                        'metric_id': 2540047,
                        'item_ids': [3457],
                        'region_ids': [100023971, 100023990],
                        'frequency_id': 1,
                        'source_id': 26,
                        'start_date': '2021-12-20',
                        'end_date': '2021-12-21',
                    }
                )

            Returns::

                [
                    {
                        "data_points": [
                            {
                                "value": 33.20465087890625,
                                "start_timestamp": 1639958400,
                                "end_timestamp": 1640044800
                            }
                        ],
                        "series_description": {
                            "source_id": 26,
                            "item_id": 3457,
                            "metric_id": 2540047,
                            "frequency_id": 1,
                            "region_id": 100023971,
                            "unit_id": 36
                        }
                    },
                    {
                        "data_points": [
                            {
                                "value": 32.73432922363281,
                                "start_timestamp": 1639958400,
                                "end_timestamp": 1640044800
                            }
                        ],
                        "series_description": {
                            "source_id": 26,
                            "item_id": 3457,
                            "metric_id": 2540047,
                            "frequency_id": 1,
                            "region_id": 100023990,
                            "unit_id": 36
                        }
                    }
                ]
        """
        data_stream_list = lib.get_data_points_v2_prime(
            self.access_token, self.api_host, **selections
        )

        # Timestamps are handled as strings upstream because of an issue in
        # JavaScript when dealing with 'int64'; convert them from str to int
        # here (in place) so callers get numeric epoch values.
        for data_stream in data_stream_list:
            for data_point in data_stream['data_points']:
                data_point['start_timestamp'] = int(data_point['start_timestamp'])
                data_point['end_timestamp'] = int(data_point['end_timestamp'])

        return data_stream_list

    def get_data_points_df(self, **selections: Dict) -> pd.DataFrame:
        """Fetch the same data as :meth:`~groclient.Experimental.get_data_points` and
        return it as a combined dataframe.

        Parameters
        ----------
        metric_id : integer
            How something is measured. e.g. "Export Value" or "Area Harvested"
        item_ids : integer or list of integers
            What is being measured. e.g. "Corn" or "Rainfall"
        region_ids : integer or list of integers
            Where something is being measured e.g. "United States Corn Belt" or "China"
        partner_region_ids : integer or list of integers, optional
            partner_region refers to an interaction between two regions, like trade or
            transportation. For example, for an Export metric, the "region" would be the exporter
            and the "partner_region" would be the importer. For most series, this can be excluded
            or set to 0 ("World") by default.
        source_id : integer
        frequency_id : integer
        unit_id : integer, optional
        start_date : string, optional
            All data points with end dates after this date.
        end_date : string, optional
            All data points with start dates before this date.
        coverage_threshold : float, optional
            Custom threshold on the coverage of geospatial data. Value should be between 0 and 1.

        Returns
        -------
        pandas.DataFrame
            The data points of every returned series, appended together into a
            single dataframe, one row per data point.
            The `start_timestamp` and `end_timestamp` columns are converted
            from epoch seconds into pandas datetime values (the column names
            are kept). The series description attributes are added as
            numeric columns with the ``series_description.`` prefix removed.
            When there are no data points, the empty dataframe is returned
            without these conversions.

            Example::

                from groclient.experimental import Experimental

                exp_client = Experimental(access_token="your_token_here")
                exp_client.get_data_points_df(
                                        **{
                                            'metric_id': 2540047,
                                            'item_ids': [3457],
                                            'region_ids': [100023971, 100023990],
                                            'frequency_id': 1,
                                            'source_id': 26,
                                            'start_date': '2021-12-20',
                                            'end_date': '2021-12-21',
                                        }
                                    )

            Returns::

                       value start_timestamp end_timestamp metric_id item_id  region_id partner_region_id frequency_id source_id unit_id
                0  33.204651      2021-12-20    2021-12-21   2540047    3457  100023971               NaN            1        26      36
                1  32.734329      2021-12-20    2021-12-21   2540047    3457  100023990               NaN            1        26      36
        """
        res = lib.get_data_points_v2_prime(
            self.access_token, self.api_host, **selections
        )

        # Each meta path is [prefix, attribute]; json_normalize joins the two
        # with "." to build the resulting column name.
        v2_data_description_meta = [
            [V2_DATA_DESCRIPTION_PREFIX, x] for x in V2_DATA_DESCRIPTION_ATTRS
        ]
        # errors='ignore' keeps series whose description lacks some attribute.
        df = pd.json_normalize(
            res, record_path=['data_points'], meta=v2_data_description_meta, errors='ignore'
        )

        if not df.empty:
            ts_cols = ["start_timestamp", "end_timestamp"]
            # Timestamps may arrive as numeric strings. Cast them to numbers
            # first: parsing strings together with `unit` is deprecated in
            # recent pandas and would eventually treat them as date strings.
            df[ts_cols] = df[ts_cols].apply(pd.to_numeric)
            df[ts_cols] = df[ts_cols].apply(pd.to_datetime, unit="s")

            # Literal replacement: without regex=False older pandas versions
            # treat the pattern as a regex, where "." matches any character.
            df.columns = df.columns.str.replace('series_description.', '', regex=False)
            df[V2_DATA_DESCRIPTION_ATTRS] = df[V2_DATA_DESCRIPTION_ATTRS].apply(pd.to_numeric)

        return df