Module destination_based_sales.per_industry
This module is dedicated to the industry-specific analyses described in Section 4.a. of the PDF report of August 2021. Indeed, the US Internal Revenue Service (IRS) provides a breakdown of its country-by-country statistics based on the main sector of activity of the parent company of the multinational group. We use it to highlight the differences, from an industry to another, in the concentration of foreign sales in tax havens.
Expand source code
"""
This module is dedicated to the industry-specific analyses described in Section 4.a. of the PDF report of August 2021.
Indeed, the US Internal Revenue Service (IRS) provides a breakdown of its country-by-country statistics based on the
main sector of activity of the parent company of the multinational group. We use it to highlight the differences, from
an industry to another, in the concentration of foreign sales in tax havens.
"""
########################################################################################################################
# --- Imports
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from destination_based_sales.utils import CONTINENT_CODES_TO_IMPUTE_TRADE
########################################################################################################################
# --- Diverse
path_to_dir = os.path.dirname(os.path.abspath(__file__))
path_to_GNI_data = os.path.join(path_to_dir, 'data', 'gross_national_income.csv')
path_to_geographies = os.path.join(path_to_dir, 'data', 'geographies.csv')
path_to_tax_haven_list = os.path.join(path_to_dir, 'data', 'tax_havens.csv')
path_to_industry_names_mapping = os.path.join(path_to_dir, 'data', 'industry_names_mapping.json')
with open(path_to_industry_names_mapping) as file:
industry_names_mapping = json.loads(file.read())
########################################################################################################################
# --- Content
class PerIndustryAnalyser:
def __init__(
self,
year,
path_to_dir=path_to_dir,
path_to_tax_haven_list=path_to_tax_haven_list,
path_to_geographies=path_to_geographies
):
"""
The logic for loading, preprocessing and analysing the industry-specific country-by-country statistics of the
IRS is encapsulated in a Python class, PerIndustryAnalyser. This is the instantiation function of this class,
which takes as arguments:
- the year to consider;
- the string path to the directory where this Python file is located;
- the string path to the list of tax havens;
- the string path to the "geographies.csv" file.
"""
if year not in [2016, 2017, 2018]:
raise Exception('For now, only the financial years from 2016 to 2018 (included) are covered.')
self.year = year
# We load the list of tax havens in a dedicated attribute
self.path_to_tax_haven_list = path_to_tax_haven_list
self.tax_havens = pd.read_csv(self.path_to_tax_haven_list)
self.path_to_dir = path_to_dir
self.path_to_geographies = path_to_geographies
def load_clean_data(
self,
exclude_all_jurisdictions=True
):
"""
This function allows to load and preprocess the industry-specific country-by-country data of the IRS.
It takes as argument a boolean, "exclude_all_jurisdictions", that determines whether or not to exclude the in-
dustry-level totals from the dataset. These are characterised by "All jurisdictions" as a partner country.
"""
# Loading the data from the corresponding Excel file
path_to_industry_data = os.path.join(
self.path_to_dir,
'data',
str(self.year),
f'{self.year - 2000}it02cbc.xlsx'
)
data = pd.read_excel(
path_to_industry_data,
engine='openpyxl'
)
# Eliminating irrelevant columns and rows
data = data[data.columns[:6]].copy()
data.columns = [
'INDUSTRY',
'AFFILIATE_COUNTRY_NAME',
'NB_REPORTING_MNEs',
'UNRELATED_PARTY_REVENUES',
'RELATED_PARTY_REVENUES',
'TOTAL_REVENUES'
]
data = data[
data.isnull().sum(axis=1) != len(data.columns)
].copy()
data = data.iloc[4:-7].copy()
data.reset_index(drop=True, inplace=True)
# Adding the right industry name to each observation
industry_indices = list(data[~data['INDUSTRY'].isnull()].index)
industries = {}
for i in range(len(industry_indices)):
if i < len(industry_indices) - 1:
restricted_df = data.loc[industry_indices[i]:industry_indices[i + 1] - 1].copy()
else:
restricted_df = data.loc[industry_indices[i]:].copy()
industry = restricted_df['INDUSTRY'].iloc[0]
restricted_df['INDUSTRY'] = industry
industries[industry] = restricted_df.copy()
data = industries[list(industries.keys())[0]].copy()
for key, value in industries.items():
if key == list(industries.keys())[0]:
continue
data = pd.concat([data, value], axis=0)
# Eliminating irrelevant observations
if exclude_all_jurisdictions:
data = data[
~data['AFFILIATE_COUNTRY_NAME'].isin(['All jurisdictions', 'Stateless entities and other country'])
].copy()
else:
data = data[
data['AFFILIATE_COUNTRY_NAME'] != 'Stateless entities and other country'
].copy()
data = data[
~data['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'total' in country_name.lower()
)
].copy()
data = data[
data.drop(columns=['NB_REPORTING_MNEs']).applymap(
lambda x: isinstance(x, str) and x == 'd'
).sum(axis=1) == 0
].copy()
# Renaming continental aggregates and a few specific partner jurisdictions
data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map(
(
lambda country_name: f'Other {country_name.split(",")[0].replace("&", "and")}'
if 'other' in country_name.lower() else country_name
)
)
data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'United Kingdom' if 'United Kingdom' in country_name else country_name
)
data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'Korea' if country_name.startswith('Korea') else country_name
)
data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'Congo' if country_name.endswith('(Brazzaville)') else country_name
)
# Adding alpha-3 country codes
geographies = pd.read_csv(self.path_to_geographies)
data = data.merge(
geographies[['NAME', 'CODE']],
how='left',
left_on='AFFILIATE_COUNTRY_NAME', right_on='NAME'
)
data.drop(columns=['NAME'], inplace=True)
data['CODE'] = data.apply(
(
lambda row: 'OASIAOCN' if isinstance(row['CODE'], float) and np.isnan(row['CODE'])
and row['AFFILIATE_COUNTRY_NAME'] == 'Other Asia and Oceania' else row['CODE']
),
axis=1
)
data.rename(
columns={
'CODE': 'AFFILIATE_COUNTRY_CODE'
},
inplace=True
)
data.reset_index(drop=True, inplace=True)
# Renaming industries for convenience
data['INDUSTRY'] = data['INDUSTRY'].map(
lambda industry: industry_names_mapping.get(industry, industry)
)
return data.copy()
def load_data_with_GNI(self, dropna=False, path_to_GNI_data=path_to_GNI_data):
"""
Building upon the previous method, "load_clean_data", this method allows to load and preprocess the industry-
specific country-by-country data while adding the Gross National Income (GNI) of each partner country, for the
corresponding year. It takes two arguments:
- a boolean, "dropna", indicating whether or not to exclude the partner countries for which we lack the GNI;
- the string path to the file containing GNI data.
"""
# Loading and cleaning industry-specific country-by-country data
data = self.load_clean_data()
# Loading and preprocessing Gross National Income (GNI) data
gross_national_income = pd.read_csv(path_to_GNI_data, delimiter=';')
gross_national_income = gross_national_income[['COUNTRY_CODE', f'GNI_{self.year}']].copy()
gross_national_income[f'GNI_{self.year}'] = gross_national_income[f'GNI_{self.year}'].map(
lambda x: x.replace(',', '.') if isinstance(x, str) else x
).astype(float)
# Merging the two datasets on partner country codes
data = data.merge(
gross_national_income,
how='left',
left_on='AFFILIATE_COUNTRY_CODE', right_on='COUNTRY_CODE'
)
data.drop(columns=['COUNTRY_CODE'], inplace=True)
if dropna:
data.dropna(inplace=True)
return data.copy()
def get_industry_overview_table(self, output_excel=True):
"""
This method allows to output the industry overview table that shows, for each year in the sample period, the
distribution of US total unrelated-party revenues and foreign unrelated-party revenues between industries. It
corresponds to Table 3 in the PDF report of August 2021. The boolean argument, "output_excel", determines
whether to save the table in an Excel file (change the target file path before using this method with
"output_excel=True").
"""
final_output = {}
for year in [2016, 2017, 2018]:
# We instantiate an industry-specific analyser for each year
analyser = PerIndustryAnalyser(year=year)
# Loading the data
data = analyser.load_clean_data(exclude_all_jurisdictions=False)
# Focusing on industry totals and US-US rows
data = data[data['AFFILIATE_COUNTRY_NAME'].isin(['All jurisdictions', 'United States'])].copy()
# Eliminating irrelevant columns
data.drop(
columns=[
'AFFILIATE_COUNTRY_CODE', 'NB_REPORTING_MNEs',
'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES'
],
inplace=True
)
# We pivot the DataFrame to show the revenues of each industry in all jurisdictions and in the US
df = data.pivot(
index='INDUSTRY',
columns='AFFILIATE_COUNTRY_NAME',
values='UNRELATED_PARTY_REVENUES'
).reset_index()
# Foreign unrelated-party revenues simply correspond to the total minus the US-US revenues
df['FOREIGN_UPR'] = df['All jurisdictions'] - df['United States']
# We move from absolute amounts to shares / a distribution
df['Share of total unrelated-party revenues (%)'] = (
df['All jurisdictions'] / df['All jurisdictions'].sum() * 100
)
df['Share of foreign unrelated-party revenues (%)'] = df['FOREIGN_UPR'] / df['FOREIGN_UPR'].sum() * 100
df.drop(columns=['All jurisdictions', 'United States', 'FOREIGN_UPR'], inplace=True)
# Ranking industries based on decreasing importance in the distribution
df.sort_values(by='Share of foreign unrelated-party revenues (%)', ascending=False, inplace=True)
final_output[year] = df.copy()
# Outputting the Excel file if relevant
if output_excel:
path_to_excel_file = '/Users/Paul-Emmanuel/Desktop/industry_overview_table_PYTHON_OUTPUT.xlsx'
with pd.ExcelWriter(path_to_excel_file, engine='xlsxwriter') as writer:
for key, value in final_output.items():
value.to_excel(writer, sheet_name=str(key), index=False)
return final_output.copy()
def plot_industry_specific_charts(self, save_PNG=False, path_to_folder=None):
"""
This method allows to output the graphs that show the relationship between partner jurisdictions’ share of US
multinational companies’ foreign unrelated-party revenues and their share of Gross National Income (GNI),
broken down by industry group. These correspond to Figure E.1 of the PDF report of August 2021.
The method takes two arguments used to save the output charts in a PNG file. This requires to set the boolean
"save_PNG" to True and to pass, in "path_to_folder", the string path to the target folder.
"""
# Setting Matplotlib parameters
plt.rcParams.update({'font.size': 18})
if save_PNG and path_to_folder is None:
raise Exception('To save the figure as a PNG, you must indicate the target folder as an argument.')
# Loading cleaned data with GNI data (eliminating rows for which we have no GNI data)
data = self.load_data_with_GNI(dropna=True)
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(25, 40))
# Figure displays one graph per industry group
for industry, ax in zip(data['INDUSTRY'].unique(), axes.flatten()):
# Restricting the dataset to the industry group under consideration and excluding the US-US row
restricted_df = data[
np.logical_and(
data['INDUSTRY'] == industry,
data['AFFILIATE_COUNTRY_CODE'] != 'USA'
)
].copy()
# Computing each partner country's share of US, industry-specific foreign unrelated-party revenues
restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'] = (
restricted_df['UNRELATED_PARTY_REVENUES'].astype(float) /
restricted_df['UNRELATED_PARTY_REVENUES'].sum()
)
# Computing each partner country's share of GNI
restricted_df[f'SHARE_OF_GNI_{self.year}'] = (
restricted_df[f'GNI_{self.year}'] / restricted_df[f'GNI_{self.year}'].sum()
)
# Computing the correlation between the two shares for the industry under consideration
correlation = np.corrcoef(
restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'],
restricted_df[f'SHARE_OF_GNI_{self.year}']
)[1, 0]
# Distinguishing non-havens, tax havens and NAFTA members
restricted_df['Category'] = (
restricted_df['AFFILIATE_COUNTRY_CODE'].isin(self.tax_havens['CODE'].unique()) * 1
+ restricted_df['AFFILIATE_COUNTRY_CODE'].isin(['CAN', 'MEX']) * 2
)
restricted_df['Category'] = restricted_df['Category'].map({0: 'Other', 1: 'Tax haven', 2: 'NAFTA member'})
restricted_df.rename(
columns={
f'SHARE_OF_GNI_{self.year}': f'Share of total {self.year} GNI (%)',
'SHARE_OF_UNRELATED_PARTY_REVENUES': 'Share of total unrelated-party revenues (%)'
},
inplace=True
)
# Building the graph with the indicative regression line and the scattered plot
sns.regplot(
x=f'Share of total {self.year} GNI (%)',
y='Share of total unrelated-party revenues (%)',
data=restricted_df,
ci=None,
ax=ax
)
sns.scatterplot(
x=f'Share of total {self.year} GNI (%)',
y='Share of total unrelated-party revenues (%)',
data=restricted_df,
hue='Category',
palette={
'Other': 'darkblue', 'Tax haven': 'darkred', 'NAFTA member': 'darkgreen'
},
s=80,
ax=ax
)
# Title indicating the industry being considered and the correlation between the share of foreign unrelated-
# party revenues and the share of GNI
ax.set_title(f'{industry} - Correlation of {round(correlation, 2)}')
plt.show()
# Saving the figure into a PNG file if relevant
if save_PNG:
fig.savefig(
os.path.join(
path_to_folder,
f'industry_specific_charts_{self.year}.png'
)
)
Classes
class PerIndustryAnalyser (year, path_to_dir='/Users/Paul-Emmanuel/Desktop/destination_based_sales/destination_based_sales', path_to_tax_haven_list='/Users/Paul-Emmanuel/Desktop/destination_based_sales/destination_based_sales/data/tax_havens.csv', path_to_geographies='/Users/Paul-Emmanuel/Desktop/destination_based_sales/destination_based_sales/data/geographies.csv')
-
The logic for loading, preprocessing and analysing the industry-specific country-by-country statistics of the IRS is encapsulated in a Python class, PerIndustryAnalyser. This is the instantiation function of this class, which takes as arguments:
- the year to consider;
- the string path to the directory where this Python file is located;
- the string path to the list of tax havens;
- the string path to the "geographies.csv" file.
Expand source code
class PerIndustryAnalyser: def __init__( self, year, path_to_dir=path_to_dir, path_to_tax_haven_list=path_to_tax_haven_list, path_to_geographies=path_to_geographies ): """ The logic for loading, preprocessing and analysing the industry-specific country-by-country statistics of the IRS is encapsulated in a Python class, PerIndustryAnalyser. This is the instantiation function of this class, which takes as arguments: - the year to consider; - the string path to the directory where this Python file is located; - the string path to the list of tax havens; - the string path to the "geographies.csv" file. """ if year not in [2016, 2017, 2018]: raise Exception('For now, only the financial years from 2016 to 2018 (included) are covered.') self.year = year # We load the list of tax havens in a dedicated attribute self.path_to_tax_haven_list = path_to_tax_haven_list self.tax_havens = pd.read_csv(self.path_to_tax_haven_list) self.path_to_dir = path_to_dir self.path_to_geographies = path_to_geographies def load_clean_data( self, exclude_all_jurisdictions=True ): """ This function allows to load and preprocess the industry-specific country-by-country data of the IRS. It takes as argument a boolean, "exclude_all_jurisdictions", that determines whether or not to exclude the in- dustry-level totals from the dataset. These are characterised by "All jurisdictions" as a partner country. """ # Loading the data from the corresponding Excel file path_to_industry_data = os.path.join( self.path_to_dir, 'data', str(self.year), f'{self.year - 2000}it02cbc.xlsx' ) data = pd.read_excel( path_to_industry_data, engine='openpyxl' ) # Eliminating irrelevant columns and rows data = data[data.columns[:6]].copy() data.columns = [ 'INDUSTRY', 'AFFILIATE_COUNTRY_NAME', 'NB_REPORTING_MNEs', 'UNRELATED_PARTY_REVENUES', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES' ] data = data[ data.isnull().sum(axis=1) != len(data.columns) ].copy() data = data.iloc[4:-7].copy() data.reset_index(drop=True, inplace=True) # Adding the right industry name to each observation industry_indices = list(data[~data['INDUSTRY'].isnull()].index) industries = {} for i in range(len(industry_indices)): if i < len(industry_indices) - 1: restricted_df = data.loc[industry_indices[i]:industry_indices[i + 1] - 1].copy() else: restricted_df = data.loc[industry_indices[i]:].copy() industry = restricted_df['INDUSTRY'].iloc[0] restricted_df['INDUSTRY'] = industry industries[industry] = restricted_df.copy() data = industries[list(industries.keys())[0]].copy() for key, value in industries.items(): if key == list(industries.keys())[0]: continue data = pd.concat([data, value], axis=0) # Eliminating irrelevant observations if exclude_all_jurisdictions: data = data[ ~data['AFFILIATE_COUNTRY_NAME'].isin(['All jurisdictions', 'Stateless entities and other country']) ].copy() else: data = data[ data['AFFILIATE_COUNTRY_NAME'] != 'Stateless entities and other country' ].copy() data = data[ ~data['AFFILIATE_COUNTRY_NAME'].map( lambda country_name: 'total' in country_name.lower() ) ].copy() data = data[ data.drop(columns=['NB_REPORTING_MNEs']).applymap( lambda x: isinstance(x, str) and x == 'd' ).sum(axis=1) == 0 ].copy() # Renaming continental aggregates and a few specific partner jurisdictions data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map( ( lambda country_name: f'Other {country_name.split(",")[0].replace("&", "and")}' if 'other' in country_name.lower() else country_name ) ) data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map( lambda country_name: 'United Kingdom' if 'United Kingdom' in country_name else country_name ) data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map( lambda country_name: 'Korea' if country_name.startswith('Korea') else country_name ) data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map( lambda country_name: 'Congo' if country_name.endswith('(Brazzaville)') else country_name ) # Adding alpha-3 country codes geographies = pd.read_csv(self.path_to_geographies) data = data.merge( geographies[['NAME', 'CODE']], how='left', left_on='AFFILIATE_COUNTRY_NAME', right_on='NAME' ) data.drop(columns=['NAME'], inplace=True) data['CODE'] = data.apply( ( lambda row: 'OASIAOCN' if isinstance(row['CODE'], float) and np.isnan(row['CODE']) and row['AFFILIATE_COUNTRY_NAME'] == 'Other Asia and Oceania' else row['CODE'] ), axis=1 ) data.rename( columns={ 'CODE': 'AFFILIATE_COUNTRY_CODE' }, inplace=True ) data.reset_index(drop=True, inplace=True) # Renaming industries for convenience data['INDUSTRY'] = data['INDUSTRY'].map( lambda industry: industry_names_mapping.get(industry, industry) ) return data.copy() def load_data_with_GNI(self, dropna=False, path_to_GNI_data=path_to_GNI_data): """ Building upon the previous method, "load_clean_data", this method allows to load and preprocess the industry- specific country-by-country data while adding the Gross National Income (GNI) of each partner country, for the corresponding year. It takes two arguments: - a boolean, "dropna", indicating whether or not to exclude the partner countries for which we lack the GNI; - the string path to the file containing GNI data. """ # Loading and cleaning industry-specific country-by-country data data = self.load_clean_data() # Loading and preprocessing Gross National Income (GNI) data gross_national_income = pd.read_csv(path_to_GNI_data, delimiter=';') gross_national_income = gross_national_income[['COUNTRY_CODE', f'GNI_{self.year}']].copy() gross_national_income[f'GNI_{self.year}'] = gross_national_income[f'GNI_{self.year}'].map( lambda x: x.replace(',', '.') if isinstance(x, str) else x ).astype(float) # Merging the two datasets on partner country codes data = data.merge( gross_national_income, how='left', left_on='AFFILIATE_COUNTRY_CODE', right_on='COUNTRY_CODE' ) data.drop(columns=['COUNTRY_CODE'], inplace=True) if dropna: data.dropna(inplace=True) return data.copy() def get_industry_overview_table(self, output_excel=True): """ This method allows to output the industry overview table that shows, for each year in the sample period, the distribution of US total unrelated-party revenues and foreign unrelated-party revenues between industries. It corresponds to Table 3 in the PDF report of August 2021. The boolean argument, "output_excel", determines whether to save the table in an Excel file (change the target file path before using this method with "output_excel=True"). """ final_output = {} for year in [2016, 2017, 2018]: # We instantiate an industry-specific analyser for each year analyser = PerIndustryAnalyser(year=year) # Loading the data data = analyser.load_clean_data(exclude_all_jurisdictions=False) # Focusing on industry totals and US-US rows data = data[data['AFFILIATE_COUNTRY_NAME'].isin(['All jurisdictions', 'United States'])].copy() # Eliminating irrelevant columns data.drop( columns=[ 'AFFILIATE_COUNTRY_CODE', 'NB_REPORTING_MNEs', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES' ], inplace=True ) # We pivot the DataFrame to show the revenues of each industry in all jurisdictions and in the US df = data.pivot( index='INDUSTRY', columns='AFFILIATE_COUNTRY_NAME', values='UNRELATED_PARTY_REVENUES' ).reset_index() # Foreign unrelated-party revenues simply correspond to the total minus the US-US revenues df['FOREIGN_UPR'] = df['All jurisdictions'] - df['United States'] # We move from absolute amounts to shares / a distribution df['Share of total unrelated-party revenues (%)'] = ( df['All jurisdictions'] / df['All jurisdictions'].sum() * 100 ) df['Share of foreign unrelated-party revenues (%)'] = df['FOREIGN_UPR'] / df['FOREIGN_UPR'].sum() * 100 df.drop(columns=['All jurisdictions', 'United States', 'FOREIGN_UPR'], inplace=True) # Ranking industries based on decreasing importance in the distribution df.sort_values(by='Share of foreign unrelated-party revenues (%)', ascending=False, inplace=True) final_output[year] = df.copy() # Outputting the Excel file if relevant if output_excel: path_to_excel_file = '/Users/Paul-Emmanuel/Desktop/industry_overview_table_PYTHON_OUTPUT.xlsx' with pd.ExcelWriter(path_to_excel_file, engine='xlsxwriter') as writer: for key, value in final_output.items(): value.to_excel(writer, sheet_name=str(key), index=False) return final_output.copy() def plot_industry_specific_charts(self, save_PNG=False, path_to_folder=None): """ This method allows to output the graphs that show the relationship between partner jurisdictions’ share of US multinational companies’ foreign unrelated-party revenues and their share of Gross National Income (GNI), broken down by industry group. These correspond to Figure E.1 of the PDF report of August 2021. The method takes two arguments used to save the output charts in a PNG file. This requires to set the boolean "save_PNG" to True and to pass, in "path_to_folder", the string path to the target folder. """ # Setting Matplotlib parameters plt.rcParams.update({'font.size': 18}) if save_PNG and path_to_folder is None: raise Exception('To save the figure as a PNG, you must indicate the target folder as an argument.') # Loading cleaned data with GNI data (eliminating rows for which we have no GNI data) data = self.load_data_with_GNI(dropna=True) fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(25, 40)) # Figure displays one graph per industry group for industry, ax in zip(data['INDUSTRY'].unique(), axes.flatten()): # Restricting the dataset to the industry group under consideration and excluding the US-US row restricted_df = data[ np.logical_and( data['INDUSTRY'] == industry, data['AFFILIATE_COUNTRY_CODE'] != 'USA' ) ].copy() # Computing each partner country's share of US, industry-specific foreign unrelated-party revenues restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'] = ( restricted_df['UNRELATED_PARTY_REVENUES'].astype(float) / restricted_df['UNRELATED_PARTY_REVENUES'].sum() ) # Computing each partner country's share of GNI restricted_df[f'SHARE_OF_GNI_{self.year}'] = ( restricted_df[f'GNI_{self.year}'] / restricted_df[f'GNI_{self.year}'].sum() ) # Computing the correlation between the two shares for the industry under consideration correlation = np.corrcoef( restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'], restricted_df[f'SHARE_OF_GNI_{self.year}'] )[1, 0] # Distinguishing non-havens, tax havens and NAFTA members restricted_df['Category'] = ( restricted_df['AFFILIATE_COUNTRY_CODE'].isin(self.tax_havens['CODE'].unique()) * 1 + restricted_df['AFFILIATE_COUNTRY_CODE'].isin(['CAN', 'MEX']) * 2 ) restricted_df['Category'] = restricted_df['Category'].map({0: 'Other', 1: 'Tax haven', 2: 'NAFTA member'}) restricted_df.rename( columns={ f'SHARE_OF_GNI_{self.year}': f'Share of total {self.year} GNI (%)', 'SHARE_OF_UNRELATED_PARTY_REVENUES': 'Share of total unrelated-party revenues (%)' }, inplace=True ) # Building the graph with the indicative regression line and the scattered plot sns.regplot( x=f'Share of total {self.year} GNI (%)', y='Share of total unrelated-party revenues (%)', data=restricted_df, ci=None, ax=ax ) sns.scatterplot( x=f'Share of total {self.year} GNI (%)', y='Share of total unrelated-party revenues (%)', data=restricted_df, hue='Category', palette={ 'Other': 'darkblue', 'Tax haven': 'darkred', 'NAFTA member': 'darkgreen' }, s=80, ax=ax ) # Title indicating the industry being considered and the correlation between the share of foreign unrelated- # party revenues and the share of GNI ax.set_title(f'{industry} - Correlation of {round(correlation, 2)}') plt.show() # Saving the figure into a PNG file if relevant if save_PNG: fig.savefig( os.path.join( path_to_folder, f'industry_specific_charts_{self.year}.png' ) )
Methods
def get_industry_overview_table(self, output_excel=True)
-
This method allows to output the industry overview table that shows, for each year in the sample period, the distribution of US total unrelated-party revenues and foreign unrelated-party revenues between industries. It corresponds to Table 3 in the PDF report of August 2021. The boolean argument, "output_excel", determines whether to save the table in an Excel file (change the target file path before using this method with "output_excel=True").
Expand source code
def get_industry_overview_table(self, output_excel=True): """ This method allows to output the industry overview table that shows, for each year in the sample period, the distribution of US total unrelated-party revenues and foreign unrelated-party revenues between industries. It corresponds to Table 3 in the PDF report of August 2021. The boolean argument, "output_excel", determines whether to save the table in an Excel file (change the target file path before using this method with "output_excel=True"). """ final_output = {} for year in [2016, 2017, 2018]: # We instantiate an industry-specific analyser for each year analyser = PerIndustryAnalyser(year=year) # Loading the data data = analyser.load_clean_data(exclude_all_jurisdictions=False) # Focusing on industry totals and US-US rows data = data[data['AFFILIATE_COUNTRY_NAME'].isin(['All jurisdictions', 'United States'])].copy() # Eliminating irrelevant columns data.drop( columns=[ 'AFFILIATE_COUNTRY_CODE', 'NB_REPORTING_MNEs', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES' ], inplace=True ) # We pivot the DataFrame to show the revenues of each industry in all jurisdictions and in the US df = data.pivot( index='INDUSTRY', columns='AFFILIATE_COUNTRY_NAME', values='UNRELATED_PARTY_REVENUES' ).reset_index() # Foreign unrelated-party revenues simply correspond to the total minus the US-US revenues df['FOREIGN_UPR'] = df['All jurisdictions'] - df['United States'] # We move from absolute amounts to shares / a distribution df['Share of total unrelated-party revenues (%)'] = ( df['All jurisdictions'] / df['All jurisdictions'].sum() * 100 ) df['Share of foreign unrelated-party revenues (%)'] = df['FOREIGN_UPR'] / df['FOREIGN_UPR'].sum() * 100 df.drop(columns=['All jurisdictions', 'United States', 'FOREIGN_UPR'], inplace=True) # Ranking industries based on decreasing importance in the distribution df.sort_values(by='Share of foreign unrelated-party revenues (%)', ascending=False, inplace=True) final_output[year] = df.copy() # Outputting the Excel file if relevant if output_excel: path_to_excel_file = '/Users/Paul-Emmanuel/Desktop/industry_overview_table_PYTHON_OUTPUT.xlsx' with pd.ExcelWriter(path_to_excel_file, engine='xlsxwriter') as writer: for key, value in final_output.items(): value.to_excel(writer, sheet_name=str(key), index=False) return final_output.copy()
def load_clean_data(self, exclude_all_jurisdictions=True)
-
This function allows to load and preprocess the industry-specific country-by-country data of the IRS.
It takes as argument a boolean, "exclude_all_jurisdictions", that determines whether or not to exclude the in- dustry-level totals from the dataset. These are characterised by "All jurisdictions" as a partner country.
Expand source code
def load_clean_data( self, exclude_all_jurisdictions=True ): """ This function allows to load and preprocess the industry-specific country-by-country data of the IRS. It takes as argument a boolean, "exclude_all_jurisdictions", that determines whether or not to exclude the in- dustry-level totals from the dataset. These are characterised by "All jurisdictions" as a partner country. """ # Loading the data from the corresponding Excel file path_to_industry_data = os.path.join( self.path_to_dir, 'data', str(self.year), f'{self.year - 2000}it02cbc.xlsx' ) data = pd.read_excel( path_to_industry_data, engine='openpyxl' ) # Eliminating irrelevant columns and rows data = data[data.columns[:6]].copy() data.columns = [ 'INDUSTRY', 'AFFILIATE_COUNTRY_NAME', 'NB_REPORTING_MNEs', 'UNRELATED_PARTY_REVENUES', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES' ] data = data[ data.isnull().sum(axis=1) != len(data.columns) ].copy() data = data.iloc[4:-7].copy() data.reset_index(drop=True, inplace=True) # Adding the right industry name to each observation industry_indices = list(data[~data['INDUSTRY'].isnull()].index) industries = {} for i in range(len(industry_indices)): if i < len(industry_indices) - 1: restricted_df = data.loc[industry_indices[i]:industry_indices[i + 1] - 1].copy() else: restricted_df = data.loc[industry_indices[i]:].copy() industry = restricted_df['INDUSTRY'].iloc[0] restricted_df['INDUSTRY'] = industry industries[industry] = restricted_df.copy() data = industries[list(industries.keys())[0]].copy() for key, value in industries.items(): if key == list(industries.keys())[0]: continue data = pd.concat([data, value], axis=0) # Eliminating irrelevant observations if exclude_all_jurisdictions: data = data[ ~data['AFFILIATE_COUNTRY_NAME'].isin(['All jurisdictions', 'Stateless entities and other country']) ].copy() else: data = data[ data['AFFILIATE_COUNTRY_NAME'] != 'Stateless entities and other country' ].copy() data = data[ ~data['AFFILIATE_COUNTRY_NAME'].map( lambda country_name: 'total' in country_name.lower() ) ].copy() data = data[ data.drop(columns=['NB_REPORTING_MNEs']).applymap( lambda x: isinstance(x, str) and x == 'd' ).sum(axis=1) == 0 ].copy() # Renaming continental aggregates and a few specific partner jurisdictions data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map( ( lambda country_name: f'Other {country_name.split(",")[0].replace("&", "and")}' if 'other' in country_name.lower() else country_name ) ) data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map( lambda country_name: 'United Kingdom' if 'United Kingdom' in country_name else country_name ) data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map( lambda country_name: 'Korea' if country_name.startswith('Korea') else country_name ) data['AFFILIATE_COUNTRY_NAME'] = data['AFFILIATE_COUNTRY_NAME'].map( lambda country_name: 'Congo' if country_name.endswith('(Brazzaville)') else country_name ) # Adding alpha-3 country codes geographies = pd.read_csv(self.path_to_geographies) data = data.merge( geographies[['NAME', 'CODE']], how='left', left_on='AFFILIATE_COUNTRY_NAME', right_on='NAME' ) data.drop(columns=['NAME'], inplace=True) data['CODE'] = data.apply( ( lambda row: 'OASIAOCN' if isinstance(row['CODE'], float) and np.isnan(row['CODE']) and row['AFFILIATE_COUNTRY_NAME'] == 'Other Asia and Oceania' else row['CODE'] ), axis=1 ) data.rename( columns={ 'CODE': 'AFFILIATE_COUNTRY_CODE' }, inplace=True ) data.reset_index(drop=True, inplace=True) # Renaming industries for convenience data['INDUSTRY'] = data['INDUSTRY'].map( lambda industry: industry_names_mapping.get(industry, industry) ) return data.copy()
def load_data_with_GNI(self, dropna=False, path_to_GNI_data='/Users/Paul-Emmanuel/Desktop/destination_based_sales/destination_based_sales/data/gross_national_income.csv')
-
Building upon the previous method, "load_clean_data", this method allows to load and preprocess the industry- specific country-by-country data while adding the Gross National Income (GNI) of each partner country, for the corresponding year. It takes two arguments:
- a boolean, "dropna", indicating whether or not to exclude the partner countries for which we lack the GNI;
- the string path to the file containing GNI data.
Expand source code
def load_data_with_GNI(self, dropna=False, path_to_GNI_data=path_to_GNI_data): """ Building upon the previous method, "load_clean_data", this method allows to load and preprocess the industry- specific country-by-country data while adding the Gross National Income (GNI) of each partner country, for the corresponding year. It takes two arguments: - a boolean, "dropna", indicating whether or not to exclude the partner countries for which we lack the GNI; - the string path to the file containing GNI data. """ # Loading and cleaning industry-specific country-by-country data data = self.load_clean_data() # Loading and preprocessing Gross National Income (GNI) data gross_national_income = pd.read_csv(path_to_GNI_data, delimiter=';') gross_national_income = gross_national_income[['COUNTRY_CODE', f'GNI_{self.year}']].copy() gross_national_income[f'GNI_{self.year}'] = gross_national_income[f'GNI_{self.year}'].map( lambda x: x.replace(',', '.') if isinstance(x, str) else x ).astype(float) # Merging the two datasets on partner country codes data = data.merge( gross_national_income, how='left', left_on='AFFILIATE_COUNTRY_CODE', right_on='COUNTRY_CODE' ) data.drop(columns=['COUNTRY_CODE'], inplace=True) if dropna: data.dropna(inplace=True) return data.copy()
def plot_industry_specific_charts(self, save_PNG=False, path_to_folder=None)
-
This method allows to output the graphs that show the relationship between partner jurisdictions’ share of US multinational companies’ foreign unrelated-party revenues and their share of Gross National Income (GNI), broken down by industry group. These correspond to Figure E.1 of the PDF report of August 2021.
The method takes two arguments used to save the output charts in a PNG file. This requires to set the boolean "save_PNG" to True and to pass, in "path_to_folder", the string path to the target folder.
Expand source code
def plot_industry_specific_charts(self, save_PNG=False, path_to_folder=None): """ This method allows to output the graphs that show the relationship between partner jurisdictions’ share of US multinational companies’ foreign unrelated-party revenues and their share of Gross National Income (GNI), broken down by industry group. These correspond to Figure E.1 of the PDF report of August 2021. The method takes two arguments used to save the output charts in a PNG file. This requires to set the boolean "save_PNG" to True and to pass, in "path_to_folder", the string path to the target folder. """ # Setting Matplotlib parameters plt.rcParams.update({'font.size': 18}) if save_PNG and path_to_folder is None: raise Exception('To save the figure as a PNG, you must indicate the target folder as an argument.') # Loading cleaned data with GNI data (eliminating rows for which we have no GNI data) data = self.load_data_with_GNI(dropna=True) fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(25, 40)) # Figure displays one graph per industry group for industry, ax in zip(data['INDUSTRY'].unique(), axes.flatten()): # Restricting the dataset to the industry group under consideration and excluding the US-US row restricted_df = data[ np.logical_and( data['INDUSTRY'] == industry, data['AFFILIATE_COUNTRY_CODE'] != 'USA' ) ].copy() # Computing each partner country's share of US, industry-specific foreign unrelated-party revenues restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'] = ( restricted_df['UNRELATED_PARTY_REVENUES'].astype(float) / restricted_df['UNRELATED_PARTY_REVENUES'].sum() ) # Computing each partner country's share of GNI restricted_df[f'SHARE_OF_GNI_{self.year}'] = ( restricted_df[f'GNI_{self.year}'] / restricted_df[f'GNI_{self.year}'].sum() ) # Computing the correlation between the two shares for the industry under consideration correlation = np.corrcoef( restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'], restricted_df[f'SHARE_OF_GNI_{self.year}'] )[1, 0] # Distinguishing non-havens, tax havens and NAFTA members restricted_df['Category'] = ( restricted_df['AFFILIATE_COUNTRY_CODE'].isin(self.tax_havens['CODE'].unique()) * 1 + restricted_df['AFFILIATE_COUNTRY_CODE'].isin(['CAN', 'MEX']) * 2 ) restricted_df['Category'] = restricted_df['Category'].map({0: 'Other', 1: 'Tax haven', 2: 'NAFTA member'}) restricted_df.rename( columns={ f'SHARE_OF_GNI_{self.year}': f'Share of total {self.year} GNI (%)', 'SHARE_OF_UNRELATED_PARTY_REVENUES': 'Share of total unrelated-party revenues (%)' }, inplace=True ) # Building the graph with the indicative regression line and the scattered plot sns.regplot( x=f'Share of total {self.year} GNI (%)', y='Share of total unrelated-party revenues (%)', data=restricted_df, ci=None, ax=ax ) sns.scatterplot( x=f'Share of total {self.year} GNI (%)', y='Share of total unrelated-party revenues (%)', data=restricted_df, hue='Category', palette={ 'Other': 'darkblue', 'Tax haven': 'darkred', 'NAFTA member': 'darkgreen' }, s=80, ax=ax ) # Title indicating the industry being considered and the correlation between the share of foreign unrelated- # party revenues and the share of GNI ax.set_title(f'{industry} - Correlation of {round(correlation, 2)}') plt.show() # Saving the figure into a PNG file if relevant if save_PNG: fig.savefig( os.path.join( path_to_folder, f'industry_specific_charts_{self.year}.png' ) )