Module destination_based_sales.bea

This module is used to load and preprocess data from the Bureau of Economic Analysis (BEA). These data allow us to split revenue variables between sales directed to the host (or affiliate) country, to the US, and to any third country. Data are loaded from Excel files saved in the "data" folder.
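
Before diving into the source, here is a minimal usage sketch; it assumes the expected "data/<year>/Part-II-E1-E17.xls" file and "data/geographies.csv" are in place, and only uses names defined in the code below.

from destination_based_sales.bea import BEADataPreprocessor

# We instantiate the preprocessor for one of the available years (2016, 2017 or 2018)
preprocessor = BEADataPreprocessor(year=2017)

# We load the fully preprocessed data, with ISO codes and consolidated continents
bea_data = preprocessor.load_final_data()

print(bea_data[['AFFILIATE_COUNTRY_NAME', 'CODE', 'TOTAL_US', 'TOTAL_AFFILIATE_COUNTRY']].head())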

Source code
"""
This module is used to load and preprocess data from the Bureau of Economic Analysis (BEA). These data allow us to
split revenue variables between sales directed to the host (or affiliate) country, to the US, and to any third
country. Data are loaded from Excel files saved in the "data" folder.
"""


########################################################################################################################
# --- Imports

import os

import numpy as np
import pandas as pd

from destination_based_sales.utils import CODES_TO_IMPUTE_BEA, impute_missing_codes


########################################################################################################################
# --- Miscellaneous

path_to_dir = os.path.dirname(os.path.abspath(__file__))

path_to_geographies = os.path.join(path_to_dir, 'data', 'geographies.csv')


########################################################################################################################
# --- Content

class BEADataPreprocessor:

    def __init__(
        self,
        year,
        path_to_dir=path_to_dir,
        path_to_geo_file=path_to_geographies
    ):
        """
        The logic for loading and preprocessing BEA data is organised in a Python class, BEADataPreprocessor.

        This is the instantiation method for this class. It takes several arguments:

        - the year to consider (for now, one of 2016, 2017 or 2018);
        - the path to the directory where this Python file is located, used to retrieve the appropriate data file;
        - the path to the "geographies.csv" file, used for instance to complement BEA data with country codes.
        """
        self.year = year

        # We construct the path to the relevant data file, which depends on the year considered
        self.path_to_bea = os.path.join(
            path_to_dir,
            'data',
            str(year),
            'Part-II-E1-E17.xls'
        )

        self.path_to_geo_file = path_to_geo_file

        self.CODES_TO_IMPUTE = CODES_TO_IMPUTE_BEA.copy()

    def load_data(self):
        """
        This class method loads and cleans the data from the BEA. It relies on the data file paths saved as class
        attributes at instantiation. Preprocessing steps are detailed in the comments below.
        """

        # We load the data from the appropriate Excel file
        bea = pd.read_excel(self.path_to_bea, sheet_name='Table II.E 2')

        # We rename columns, the following column names being used throughout the code
        bea.columns = [
            'AFFILIATE_COUNTRY_NAME', 'TOTAL', 'TOTAL_US', 'TOTAL_US_RELATED', 'TOTAL_US_UNRELATED', 'TOTAL_FOREIGN',
            'TOTAL_AFFILIATE_COUNTRY', 'TOTAL_AFFILIATE_COUNTRY_RELATED', 'TOTAL_AFFILIATE_COUNTRY_UNRELATED',
            'TOTAL_OTHER_COUNTRY', 'TOTAL_OTHER_COUNTRY_RELATED', 'TOTAL_OTHER_COUNTRY_UNRELATED'
        ]

        # We only keep relevant rows, skipping the header rows at the top of the sheet
        bea = bea.loc[8:].copy()

        # We drop rows with 11 or more missing values out of 12 columns, i.e. rows that carry (almost) no data
        bea = bea[~(bea.isnull().sum(axis=1) >= 11)].copy()

        # We drop the last two rows of the table
        bea = bea.iloc[:-2].copy()

        bea = bea[bea['AFFILIATE_COUNTRY_NAME'] != 'Latin America and Other Western Hemisphere'].copy()

        # We re-index the DataFrame after having filtered out inappropriate rows
        bea.reset_index(inplace=True, drop=True)

        # Due to the organisation of the Excel file, the DataFrame contains rows that only display the name of the
        # continent associated with the countries below; we eliminate these rows and reconstitute a "one-block" dataset
        continent_names = [
            'Europe',
            'South America',
            'Central America',
            'Other Western Hemisphere',
            'Africa',
            'Middle East',
            'Asia and Pacific',
        ]

        # We fetch the list of the indices of these rows
        total_indices = list(
            bea[
                bea['AFFILIATE_COUNTRY_NAME'].isin(continent_names)
            ].index
        )

        # We will store the sub-DataFrames associated with each continent in a dedicated dictionary
        continent_extracts = {}

        for i, continent_name in enumerate(continent_names):
            if i + 1 < len(total_indices):
                continent_df = bea.loc[total_indices[i]:total_indices[i + 1] - 1].copy()

            else:
                continent_df = bea.loc[total_indices[i]:bea.index[-1]].copy()

            # In each sub-DataFrame, we rename the "Other" row as "Other [+ CONTINENT NAME]"
            continent_df['AFFILIATE_COUNTRY_NAME'] = continent_df['AFFILIATE_COUNTRY_NAME'].map(
                lambda country_name: country_name if country_name != 'Other' else 'Other ' + continent_name
            )

            continent_df = continent_df[continent_df['AFFILIATE_COUNTRY_NAME'] != continent_name].copy()

            continent_extracts[continent_name] = continent_df.copy()

        # The Canada row is outside any continent block
        bea_cleaned = bea[bea['AFFILIATE_COUNTRY_NAME'] == 'Canada'].copy()

        # On top of it, we stack the different continent blocks to obtain one "continuous" dataset
        for continent_extract in continent_extracts.values():
            bea_cleaned = pd.concat([bea_cleaned, continent_extract], axis=0)

        # Finally, we reformat missing values ('(D)' marks values suppressed in BEA tables to avoid disclosure)
        for column in bea_cleaned.columns[1:]:
            bea_cleaned[column] = bea_cleaned[column].map(
                lambda x: np.nan if x == '(D)' else x
            )

        return bea_cleaned.reset_index(drop=True)

    def load_data_with_geo_codes(self):
        """
        This class method adds geographical ISO codes to the raw dataset loaded with the "load_data" method. It
        relies on the "impute_missing_codes" function defined in utils.py.
        """
        bea = self.load_data()

        geographies = pd.read_csv(self.path_to_geo_file)

        # We merge the DataFrame containing raw BEA data with the one containing the ISO code correspondences
        merged_df = bea.merge(
            geographies,
            how='left',
            left_on='AFFILIATE_COUNTRY_NAME', right_on='NAME'
        )

        # We add missing codes
        for column in ['NAME', 'CODE', 'CONTINENT_NAME', 'CONTINENT_CODE']:
            merged_df[column] = merged_df.apply(
                lambda row: impute_missing_codes(
                    row=row,
                    column=column,
                    codes_to_impute=self.CODES_TO_IMPUTE
                ),
                axis=1
            )

        # We don't consider "Other" aggregates as they are not the same as in the IRS data
        merged_df = merged_df[~merged_df['CODE'].isnull()].copy()
        merged_df = merged_df[merged_df['CODE'].map(len) <= 3].copy()

        return merged_df.copy()

    def load_final_data(self):
        """
        This class method loads the fully preprocessed BEA data. Starting from the output of the
        "load_data_with_geo_codes" method, continent names and codes are limited to 4 pairs, corresponding
        respectively to Europe, Africa, America (North and South) and Asia-Pacific (gathering Asia and Oceania).
        """
        bea = self.load_data_with_geo_codes()

        bea['CONTINENT_CODE'] = bea['CONTINENT_CODE'].map(
            lambda x: 'AMR' if x in ['SAMR', 'NAMR'] else x
        )

        bea['CONTINENT_NAME'] = bea['CONTINENT_NAME'].map(
            lambda x: 'America' if x in ['South America', 'North America'] else x
        )

        bea['CONTINENT_CODE'] = bea['CONTINENT_CODE'].map(
            lambda x: 'APAC' if x in ['ASIA', 'OCN'] or x is None else x
        )

        bea['CONTINENT_NAME'] = bea['CONTINENT_NAME'].map(
            lambda x: 'Asia-Pacific' if x in ['Asia', 'Oceania'] or x is None else x
        )

        return bea.copy()

Classes

class BEADataPreprocessor (year, path_to_dir=path_to_dir, path_to_geo_file=path_to_geographies)

The logic for loading and preprocessing BEA data is organised in a Python class, BEADataPreprocessor.

This is the instantiation method for this class. It takes several arguments:

  • the year to consider (for now, one of 2016, 2017 or 2018);
  • the path to the directory where this Python file is located, used to retrieve the appropriate data file;
  • the path to the "geographies.csv" file, used for instance to complement BEA data with country codes.
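
As a quick sketch of how the default paths work (the override values below are purely hypothetical), they point inside the package and can be replaced, e.g. for testing:

preprocessor = BEADataPreprocessor(
    year=2018,
    path_to_dir='/path/to/alternative/directory',
    path_to_geo_file='/path/to/alternative/geographies.csv'
)

# The BEA file path is built as <path_to_dir>/data/<year>/Part-II-E1-E17.xls
print(preprocessor.path_to_bea)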

Methods

def load_data(self)

This class method loads and cleans the data from the BEA. It relies on the data file paths saved as class attributes at instantiation. Preprocessing steps are detailed in the comments of the source code above.
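
To illustrate the block-splitting logic on a toy example (synthetic country names, not actual BEA figures), the method turns a layout of continent header rows followed by country rows into a single country-level table:

import pandas as pd

# Synthetic miniature of the BEA layout: each continent block ends with an "Other" residual row
toy = pd.DataFrame({'AFFILIATE_COUNTRY_NAME': [
    'Canada', 'Europe', 'France', 'Other', 'Africa', 'Kenya', 'Other'
]})

continent_names = ['Europe', 'Africa']
total_indices = list(toy[toy['AFFILIATE_COUNTRY_NAME'].isin(continent_names)].index)

continent_extracts = {}
for i, continent_name in enumerate(continent_names):
    if i + 1 < len(total_indices):
        block = toy.loc[total_indices[i]:total_indices[i + 1] - 1].copy()
    else:
        block = toy.loc[total_indices[i]:toy.index[-1]].copy()

    # "Other" residual rows are disambiguated with the continent name
    block['AFFILIATE_COUNTRY_NAME'] = block['AFFILIATE_COUNTRY_NAME'].map(
        lambda name: name if name != 'Other' else 'Other ' + continent_name
    )

    # The continent header row itself is dropped
    continent_extracts[continent_name] = block[block['AFFILIATE_COUNTRY_NAME'] != continent_name].copy()

# Stacking Canada (which sits outside any block) and the extracts yields:
# Canada, France, Other Europe, Kenya, Other Africa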

def load_data_with_geo_codes(self)

This class method adds geographical ISO codes to the raw dataset loaded with the "load_data" method. It relies on the "impute_missing_codes" function defined in utils.py.
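
A sketch of the merge and filtering steps, with a hypothetical two-row extract of geographies.csv (the code values are illustrative, and the imputation step via impute_missing_codes is omitted here):

import pandas as pd

# Hypothetical extract of geographies.csv
geographies = pd.DataFrame({
    'NAME': ['France', 'Kenya'],
    'CODE': ['FRA', 'KEN'],
    'CONTINENT_NAME': ['Europe', 'Africa'],
    'CONTINENT_CODE': ['EUR', 'AFR'],
})

bea = pd.DataFrame({'AFFILIATE_COUNTRY_NAME': ['France', 'Kenya', 'Other Europe']})

merged_df = bea.merge(
    geographies,
    how='left',
    left_on='AFFILIATE_COUNTRY_NAME', right_on='NAME'
)

# Rows left without a code, or whose code exceeds 3 characters (multi-country aggregates), are dropped
merged_df = merged_df[~merged_df['CODE'].isnull()].copy()
merged_df = merged_df[merged_df['CODE'].map(len) <= 3].copy()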

def load_final_data(self)

This class method loads the fully preprocessed BEA data. Starting from the output of the "load_data_with_geo_codes" method, continent names and codes are limited to 4 pairs, corresponding respectively to Europe, Africa, America (North and South) and Asia-Pacific (gathering Asia and Oceania).
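
A sketch of the consolidation on a toy series of continent codes ('EUR' and 'AFR' are hypothetical placeholders; 'SAMR', 'NAMR', 'ASIA' and 'OCN' are the codes the method itself handles):

import pandas as pd

codes = pd.Series(['EUR', 'NAMR', 'SAMR', 'ASIA', 'OCN', 'AFR', None])

# North and South America are merged into a single 'AMR' code
codes = codes.map(lambda x: 'AMR' if x in ['SAMR', 'NAMR'] else x)

# Asia and Oceania (as well as unresolved rows) are merged into 'APAC'
codes = codes.map(lambda x: 'APAC' if x in ['ASIA', 'OCN'] or x is None else x)

# Result: EUR, AMR, AMR, APAC, APAC, AFR, APAC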
