# Importing Relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime as dt
import statsmodels as sm

# Stationarity testing
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
import ruptures as rpt

# Granger causality
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.api import VAR

# Cointegration testing
from statsmodels.tsa.stattools import coint

# Importing data
laborforce = pd.read_csv("data/laborforce.csv") 
totaljobs =  pd.read_csv("data/manufacturing_jobs.csv")
wages_manufacturing = pd.read_csv("data/manufacturing_wages.csv")
wages_all_sectors = pd.read_csv("data/wages_all_sectors.csv")
cpi = pd.read_csv("data/consumerpriceindex.csv")
gini = pd.read_csv("data/ginicoefficient.csv")

# Cleaning data
# Wages (manufacturing and all private sector)
adjusted_wages = pd.merge(wages_manufacturing, cpi, on="observation_date", how="inner")
adjusted_wages = pd.merge(adjusted_wages, wages_all_sectors, on="observation_date", how="inner")
adjusted_wages["real_wages_manufacturing"] = (adjusted_wages["CES3000000008"]/adjusted_wages["CPIAUCSL"])*100
adjusted_wages["real_wages_all"] = (adjusted_wages["AHETPI"]/adjusted_wages["CPIAUCSL"])*100
adjusted_wages["observation_date"] = pd.to_datetime(adjusted_wages["observation_date"])
adjusted_wages = adjusted_wages.dropna()

# Percent of workers in manufacturing
percent_manufacturing = pd.merge(totaljobs, laborforce, on="observation_date", how="inner")
percent_manufacturing["percent_manufacturing"] = percent_manufacturing["MANEMP"]/percent_manufacturing["CLF16OV"] 
percent_manufacturing["observation_date"] = pd.to_datetime(percent_manufacturing["observation_date"])
percent_manufacturing = percent_manufacturing.dropna()

# Interpolating gini
gini["observation_date"] = pd.to_datetime(gini["observation_date"])
gini["year"] = gini["observation_date"].dt.year
monthly_index = pd.date_range(start=gini["observation_date"].min(), end=gini["observation_date"].max(), freq="MS")
monthly_gini = pd.DataFrame({"observation_date": monthly_index})
monthly_gini["year"] = monthly_gini["observation_date"].dt.year
monthly_gini = monthly_gini.merge(gini[["year", "SIPOVGINIUSA"]], on="year", how="left")
monthly_gini = monthly_gini[["observation_date", "SIPOVGINIUSA"]]

# Combined dataset
data_combined = pd.merge(percent_manufacturing, adjusted_wages, on="observation_date", how="inner")
data_combined = pd.merge(data_combined, monthly_gini, on="observation_date", how="inner")
data_combined = data_combined[["observation_date","percent_manufacturing","real_wages_manufacturing", "real_wages_all", "SIPOVGINIUSA", "CES3000000008"]]
data_combined = data_combined.rename(columns={"SIPOVGINIUSA": "gini", "CES3000000008": "cpi"})
data_combined = data_combined.dropna()
display(data_combined)
data_combined.to_csv("data/datacombined.csv")

# Early visualizations
for label, content in data_combined.drop("observation_date",axis=1).items():
    years=data_combined["observation_date"]
    plt.plot(years, content)
    plt.title(f"{label}")
    plt.xticks(rotation=45)
    plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(nbins=10))
    plt.show()

# Checking for stationarity in the data
def stationarity(dataset):
    for label, content in dataset.drop("observation_date",axis=1).items():
        ADF = adfuller(content, regression='ct')
        print(f"Stationarity testing for {label}:\n--------------------")
        print("DF-GLS statistic: %f" % ADF[0])
        print("DF-GLS p-value: %f" % ADF[1])

        # KPSS Test
        KPSS = kpss(content, regression='ct')
        print("KPSS test statistic: %f" % KPSS[0])
        print("KPSS p-value: %f" % KPSS[1])
        print("--------------------")

# Differencing the data
data_differenced = data_combined.diff().dropna()
stationarity(data_differenced)

Stationarity testing for percent_manufacturing:
--------------------
DF-GLS statistic: -8.792263
DF-GLS p-value: 0.000000
KPSS test statistic: 0.080311
KPSS p-value: 0.100000
--------------------
Stationarity testing for real_wages_manufacturing:
--------------------
DF-GLS statistic: -6.445318
DF-GLS p-value: 0.000000
KPSS test statistic: 0.204149
KPSS p-value: 0.014444
--------------------
Stationarity testing for real_wages_all:
--------------------
DF-GLS statistic: -4.610740
DF-GLS p-value: 0.000989
KPSS test statistic: 0.250300
KPSS p-value: 0.010000
--------------------
Stationarity testing for gini:
--------------------
DF-GLS statistic: -26.801851
DF-GLS p-value: 0.000000
KPSS test statistic: 0.077690
KPSS p-value: 0.100000
--------------------
Stationarity testing for cpi:
--------------------
DF-GLS statistic: -0.060434
DF-GLS p-value: 0.993567
KPSS test statistic: 0.353923
KPSS p-value: 0.010000
--------------------

C:\Users\yyork\AppData\Local\Temp\ipykernel_42556\2761193709.py:10: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  KPSS = kpss(content, regression='ct')
C:\Users\yyork\AppData\Local\Temp\ipykernel_42556\2761193709.py:10: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  KPSS = kpss(content, regression='ct')
C:\Users\yyork\AppData\Local\Temp\ipykernel_42556\2761193709.py:10: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  KPSS = kpss(content, regression='ct')
C:\Users\yyork\AppData\Local\Temp\ipykernel_42556\2761193709.py:10: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  KPSS = kpss(content, regression='ct')

# Checking for structural breaks in the data
def break_points(dataset):
    for label, content in dataset.drop("observation_date",axis=1).items():
        years = data_combined["observation_date"]
        signal = content.values
        model = "l2"
        algo = rpt.Pelt(model=model).fit(signal)    
        breaks = algo.predict(pen=5)
        print("Break points at indices:", breaks)
        break_indices = breaks[:-1]  # Exclude the last index (end of series)
        date_labels = years.iloc[break_indices]

        plt.figure(figsize=(10, 4))
        plt.plot(years, content)
        for idx in break_indices:
            plt.axvline(x=years.iloc[idx], color='red', linestyle='--')
        plt.xticks(rotation=45)
        plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(nbins=10))
        plt.xlabel("Date")
        plt.ylabel("Value")
        plt.title(f"Break points with dates for {label}")
        plt.tight_layout()
        plt.show()

break_points(data_combined)

Break points at indices: [721]

Break points at indices: [85, 285, 625, 721]

Break points at indices: [50, 190, 300, 415, 540, 660, 721]

Break points at indices: [45, 120, 165, 215, 230, 300, 345, 445, 490, 600, 675, 695, 721]

Break points at indices: [55, 95, 125, 155, 180, 200, 220, 250, 290, 320, 355, 390, 425, 455, 485, 520, 545, 590, 625, 650, 670, 690, 705, 721]

# Finding break points for the non-interpolated gini

years = gini["observation_date"]
signal = gini["SIPOVGINIUSA"].values
model = "l2"
algo = rpt.Pelt(model=model).fit(signal)    
breaks = algo.predict(pen=5)
print("Break points at indices:", breaks)
break_indices = breaks[:-1]  # Exclude the last index (end of series)
date_labels = years.iloc[break_indices]

plt.figure(figsize=(10, 4))
plt.plot(years, gini["SIPOVGINIUSA"])
for idx in break_indices:
    plt.axvline(x=years.iloc[idx], color='red', linestyle='--')
plt.xticks(rotation=45)
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(nbins=10))
plt.xlabel("Date")
plt.ylabel("Value")
plt.title("Break points with dates for the gini coefficient")
plt.tight_layout()
plt.show()

Break points at indices: [10, 20, 30, 40, 62]

# Granger causality test
def granger(input):
    return(grangercausalitytests(input, 4))
    
granger(data_differenced[["gini","percent_manufacturing"]])
granger(data_differenced[["percent_manufacturing","real_wages_manufacturing"]])

# Vector autoregression
def granger_var(input):
    model = VAR(input)
    results = model.fit(maxlags=10, ic='aic')
    return(results.summary())

# granger_var(data_differenced[["SIPOVGINIUSA","percent_manufacturing"]])
def cointegration(input):
    score, pvalue, _ = coint(input[input.columns[0]], input[input.columns[1]])
    print(f"Cointegration test statistic: {score}")
    print(f"Cointegration p-value: {pvalue}")
    
cointegration(data_combined[["gini","percent_manufacturing"]])
cointegration(data_combined[["real_wages_manufacturing", "real_wages_all"]])

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0586  , p=0.8088  , df_denom=716, df_num=1
ssr based chi2 test:   chi2=0.0588  , p=0.8084  , df=1
likelihood ratio test: chi2=0.0588  , p=0.8084  , df=1
parameter F test:         F=0.0586  , p=0.8088  , df_denom=716, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.0317  , p=0.1319  , df_denom=713, df_num=2
ssr based chi2 test:   chi2=4.0920  , p=0.1293  , df=2
likelihood ratio test: chi2=4.0804  , p=0.1300  , df=2
parameter F test:         F=2.0317  , p=0.1319  , df_denom=713, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.4291  , p=0.2330  , df_denom=710, df_num=3
ssr based chi2 test:   chi2=4.3294  , p=0.2280  , df=3
likelihood ratio test: chi2=4.3164  , p=0.2293  , df=3
parameter F test:         F=1.4291  , p=0.2330  , df_denom=710, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.1165  , p=0.3475  , df_denom=707, df_num=4
ssr based chi2 test:   chi2=4.5227  , p=0.3399  , df=4
likelihood ratio test: chi2=4.5085  , p=0.3415  , df=4
parameter F test:         F=1.1165  , p=0.3475  , df_denom=707, df_num=4

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.8364  , p=0.3607  , df_denom=716, df_num=1
ssr based chi2 test:   chi2=0.8400  , p=0.3594  , df=1
likelihood ratio test: chi2=0.8395  , p=0.3596  , df=1
parameter F test:         F=0.8364  , p=0.3607  , df_denom=716, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.0013  , p=0.1359  , df_denom=713, df_num=2
ssr based chi2 test:   chi2=4.0306  , p=0.1333  , df=2
likelihood ratio test: chi2=4.0193  , p=0.1340  , df=2
parameter F test:         F=2.0013  , p=0.1359  , df_denom=713, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.4545  , p=0.2258  , df_denom=710, df_num=3
ssr based chi2 test:   chi2=4.4066  , p=0.2208  , df=3
likelihood ratio test: chi2=4.3931  , p=0.2220  , df=3
parameter F test:         F=1.4545  , p=0.2258  , df_denom=710, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.5722  , p=0.1799  , df_denom=707, df_num=4
ssr based chi2 test:   chi2=6.3690  , p=0.1732  , df=4
likelihood ratio test: chi2=6.3409  , p=0.1751  , df=4
parameter F test:         F=1.5722  , p=0.1799  , df_denom=707, df_num=4
Cointegration test statistic: -2.476876439015145
Cointegration p-value: 0.2893613365022458
Cointegration test statistic: -2.903154275973649
Cointegration p-value: 0.1350564964625618

# Segment-wise cointegration test based on breakpoints
all_wages_breaks = [0, 50, 195, 450, 620, 721]
manu_wages_breaks = [0, 85, 280, 721]
gini_breaks =  [0, 120, 240, 360, 480, 732]

results = []
for i in range(len(all_wages_breaks)-1):
    # Find overlap for this segment
    seg_start = max(all_wages_breaks[i], manu_wages_breaks[i] if i < len(manu_wages_breaks)-1 else manu_wages_breaks[-2])
    seg_end = min(all_wages_breaks[i+1], manu_wages_breaks[i+1] if i+1 < len(manu_wages_breaks) else manu_wages_breaks[-1])
    if seg_start >= seg_end:
        continue
    seg = data_combined.iloc[seg_start:seg_end]
    if len(seg) < 30:
        continue  # skip very short segments
    score, pvalue, _ = coint(seg["real_wages_manufacturing"], seg["real_wages_all"])
    results.append({
        "segment": (seg_start, seg_end),
        "n_obs": len(seg),
        "statistic": score,
        "pvalue": pvalue
    })

for r in results:
    print(f"Segment {r['segment'][0]}–{r['segment'][1]} (n={r['n_obs']}): stat={r['statistic']:.2f}, p={r['pvalue']:.3f}")

Segment 0–50 (n=50): stat=-4.50, p=0.001
Segment 85–195 (n=110): stat=-2.05, p=0.500
Segment 280–450 (n=170): stat=-2.09, p=0.481
Segment 450–620 (n=170): stat=-2.60, p=0.237
Segment 620–721 (n=101): stat=-1.58, p=0.731

# Segment-wise cointegration test: manufacturing wages vs. gini (using non-interpolated gini breakpoints)
gini_breaks_monthly = gini_breaks  # <-- Replace with your actual indices from previous cell

results_gini = []
for i in range(len(gini_breaks_monthly)-1):
    seg_start = gini_breaks_monthly[i]
    seg_end = gini_breaks_monthly[i+1]
    if seg_start >= seg_end:
        continue
    seg = data_combined.iloc[seg_start:seg_end]
    if len(seg) < 30:
        continue  # skip very short segments
    score, pvalue, _ = coint(seg["real_wages_manufacturing"], seg["gini"])
    results_gini.append({
        "segment": (seg_start, seg_end),
        "n_obs": len(seg),
        "statistic": score,
        "pvalue": pvalue
    })

for r in results_gini:
    print(f"Segment {r['segment'][0]}–{r['segment'][1]} (n={r['n_obs']}): stat={r['statistic']:.2f}, p={r['pvalue']:.3f}")
    
gini_break_indices_annual = [10, 20, 30, 40, 61]
gini_break_dates = gini.iloc[gini_break_indices_annual]["observation_date"].values
# Find the corresponding indices in monthly_gini
monthly_gini_break_indices = [monthly_gini.index[monthly_gini["observation_date"] == d][0] for d in gini_break_dates]
print("Break dates in annual Gini:", gini_break_dates)
print("Corresponding indices in monthly_gini:", monthly_gini_break_indices)

Segment 0–120 (n=120): stat=-1.40, p=0.796
Segment 120–240 (n=120): stat=-1.31, p=0.826
Segment 240–360 (n=120): stat=-1.48, p=0.768
Segment 360–480 (n=120): stat=-1.61, p=0.715
Segment 480–732 (n=241): stat=-0.92, p=0.916

# For Choropleth map -- merging together the state-level data.

import functools

state_files = {
    "illinois":      ("data/illinois_data.csv",     "ILMFG"),
    "indiana":       ("data/indiana_data.csv",       "INMFG"),
    "michigan":      ("data/michigan_data.csv",      "MIMFG"),
    "ohio":          ("data/ohio_data.csv",           "OHMFG"),
    "pennsylvania":  ("data/pennsylvania_data.csv",  "PAMFG"),
    "new_york":      ("data/newyork_data.csv",        "NYMFG"),
    "wisconsin":     ("data/wisconsin_data.csv",      "WIMFG"),
    "west_virginia": ("data/west_virginia_data.csv", "WVMFG"),
}

dfs = []
for state, (filepath, series_col) in state_files.items():
    df = pd.read_csv(filepath)
    df = df.rename(columns={series_col: state})
    df["observation_date"] = pd.to_datetime(df["observation_date"])
    dfs.append(df[["observation_date", state]])

state_employment = functools.reduce(
    lambda left, right: pd.merge(left, right, on="observation_date", how="inner"),
    dfs
)

display(state_employment)

# Converting everything to JS 
state_cols = [c for c in state_employment.columns if c != "observation_date"]

state_employment_scaled = state_employment.copy()
state_employment_scaled[state_cols] = state_employment[state_cols].apply(
    lambda col: (col - col.min()) / (col.max() - col.min())
)

display(state_employment_scaled)

# FIPS codes for each state column
fips_map = {
    "illinois":      "17",
    "indiana":       "18",
    "michigan":      "26",
    "ohio":          "39",
    "pennsylvania":  "42",
    "new_york":      "36",
    "wisconsin":     "55",
    "west_virginia": "54",
}

years_to_export = range(1990, 2025, 5)

snapshots = {}
for year in years_to_export:
    scaled_rows = state_employment_scaled[state_employment_scaled["observation_date"].dt.year == year]
    raw_rows = state_employment[state_employment["observation_date"].dt.year == year]
    if len(scaled_rows) == 0:
        continue
    scaled_row = scaled_rows.iloc[0]
    raw_row = raw_rows.iloc[0]
    entries = []
    for state, fips in fips_map.items():
        value = round(scaled_row[state], 4)
        raw = round(raw_row[state], 1)
        entries.append(f'    {{ fips: "{fips}", name: "{state.replace("_", " ").title()}", value: {value}, raw: {raw} }}')
    snapshots[str(year)] = entries

lines = ["var rustBeltData = {"]
for i, (year, entries) in enumerate(snapshots.items()):
    comma = "," if i < len(snapshots) - 1 else ""
    lines.append(f'  "{year}": [\n' + ",\n".join(entries) + f"\n  ]{comma}")
lines.append("};")

js_content = "\n".join(lines)
with open("data/rustbelddata.js", "w") as f:
    f.write(js_content)
print(js_content[:600])

latest_raw = state_employment.iloc[-1]

rows_raw = []
for state, fips in fips_map.items():
    value = round(latest_raw[state], 1)
    rows_raw.append(f'  {{ fips: "{fips}", name: "{state.replace("_", " ").title()}", value: {value} }}')

js_content_raw = "var rustBeltDataRaw = [\n" + ",\n".join(rows_raw) + "\n];"

with open("data/rustbeltdata_raw.js", "w") as f:
    f.write(js_content_raw)

print(js_content_raw)

	observation_date	percent_manufacturing	real_wages_manufacturing	real_wages_all	gini	cpi
0	1964-01-01	0.217190	7.692308	8.080155	37.4	2.38
1	1964-02-01	0.216584	7.699774	8.087997	37.4	2.38
2	1964-03-01	0.216880	7.692308	8.112476	37.4	2.38
3	1964-04-01	0.215424	7.754443	8.142165	37.4	2.40
4	1964-05-01	0.215437	7.746934	8.134280	37.4	2.40
...	...	...	...	...	...	...
716	2023-09-01	0.076745	8.663221	9.496349	41.8	26.62
717	2023-10-01	0.076565	8.664396	9.509386	41.8	26.66
718	2023-11-01	0.076501	8.713346	9.531134	41.8	26.85
719	2023-12-01	0.076899	8.793779	9.545218	41.8	27.15
720	2024-01-01	0.076954	8.773063	9.560927	41.8	27.17

	observation_date	illinois	indiana	michigan	ohio	pennsylvania	new_york	wisconsin	west_virginia
0	1990-01-01	923.7	605.6	810.0	1044.4	965.4	1009.7	524.5	81.9
1	1990-02-01	927.3	607.3	836.0	1061.6	964.5	1007.3	527.9	82.1
2	1990-03-01	927.2	609.6	848.3	1064.6	962.5	1005.4	526.8	82.5
3	1990-04-01	923.2	609.6	843.3	1068.6	958.4	995.8	525.1	82.7
4	1990-05-01	924.1	609.7	838.1	1068.6	957.1	994.6	525.3	82.7
...	...	...	...	...	...	...	...	...	...
430	2025-11-01	563.9	516.1	581.9	679.4	552.3	398.8	454.8	44.9
431	2025-12-01	562.5	515.4	582.5	679.3	552.4	399.5	454.6	45.0
432	2026-01-01	569.2	515.7	578.7	677.7	554.4	399.5	451.6	44.9
433	2026-02-01	567.8	518.1	579.0	677.8	551.8	397.3	451.8	44.9
434	2026-03-01	567.1	520.1	580.7	676.1	554.8	396.9	453.6	45.1

	observation_date	illinois	indiana	michigan	ohio	pennsylvania	new_york	wisconsin	west_virginia
0	1990-01-01	0.990842	0.738143	0.821284	0.948581	1.000000	1.000000	0.565593	0.970075
1	1990-02-01	1.000000	0.744918	0.873788	0.983957	0.998097	0.996351	0.585155	0.975062
2	1990-03-01	0.999746	0.754085	0.898627	0.990128	0.993868	0.993462	0.578826	0.985037
3	1990-04-01	0.989570	0.754085	0.888530	0.998355	0.985198	0.978866	0.569045	0.990025
4	1990-05-01	0.991860	0.754484	0.878029	0.998355	0.982449	0.977041	0.570196	0.990025
...	...	...	...	...	...	...	...	...	...
430	2025-11-01	0.075553	0.381427	0.360662	0.197861	0.126454	0.071157	0.164557	0.047382
431	2025-12-01	0.071992	0.378637	0.361874	0.197655	0.126665	0.072221	0.163406	0.049875
432	2026-01-01	0.089036	0.379833	0.354200	0.194364	0.130894	0.072221	0.146145	0.047382
433	2026-02-01	0.085474	0.389398	0.354806	0.194570	0.125396	0.068876	0.147296	0.047382
434	2026-03-01	0.083694	0.397369	0.358239	0.191074	0.131740	0.068268	0.157652	0.052369

Analysis: Identifying a relationship between wealth inequality and declines in American manufacturing¶

A.I. Use Statment¶

Works Cited¶

Articles referenced¶

Code¶

Cleaning and joining data¶

Analyses¶

Cross-variance and cointegration between wealth inequality and manufacturing: Trying to identify causality in the data¶

Looking for co-integration within shared regimes of the dataset¶

Exporting a JS-friendly version of the data for compatibility with D3¶