import numpy as np
import plotly.graph_objects as go


# ------------------------------------------------------------------------------
# 1. Synthetic data whose two predictors are exactly collinear
# ------------------------------------------------------------------------------
np.random.seed(42)  # fixed seed so the noise draw below is repeatable
n = 50

# The second predictor is an exact multiple of the first (x2 = 3*x1), so the
# two columns carry the same information.
x1 = np.linspace(0, 1, n)
x2 = 3 * x1

# The response is generated from x1 alone, plus Gaussian noise.
b0_true = 2.0
b1_true = 4.0
noise = 0.5 * np.random.randn(n)
y = b0_true + b1_true * x1 + noise

# ------------------------------------------------------------------------------
# 2. Least-squares fit (rank-deficient, so the minimiser is not unique)
# ------------------------------------------------------------------------------
# Columns of the design matrix: intercept, x1, x2 (with x2 = 3*x1).
X = np.column_stack((np.ones(n), x1, x2))
lstsq_result = np.linalg.lstsq(X, y, rcond=None)
beta, residuals, rank, s = lstsq_result
b0_est, b1_est, b2_est = beta

print("A particular least-squares solution (minimum-norm):")
print(f"  b0 = {b0_est:.3f}, b1 = {b1_est:.3f}, b2 = {b2_est:.3f}")
print(f"Rank of X = {rank} (expect 2, since one column is dependent)")

# ------------------------------------------------------------------------------
# 3. A family of equally good planes, indexed by a free parameter t
# ------------------------------------------------------------------------------
# On the data x2 = 3*x1, so the prediction is b0 + (b1 + 3*b2)*x1: every pair
# (b1, b2) sharing the same value of b1 + 3*b2 yields identical fitted values.
# With c = b1_est + 3*b2_est, each plane uses
#     b2(t) = b2_est + t,   b1(t) = c - 3*b2(t).
# ------------------------------------------------------------------------------
c = b1_est + 3*b2_est
t_values = np.linspace(-2, 2, 5)

fig = go.Figure()

# Observed points as a 3D scatter.
data_marker = {'size': 4, 'color': 'blue'}
data_trace = go.Scatter3d(
    x=x1,
    y=x2,
    z=y,
    mode='markers',
    marker=data_marker,
    name='Data Points',
)
fig.add_trace(data_trace)

# Rectangular (x1, x2) grid spanning the observed predictor ranges; the planes
# are evaluated on it.
x1_grid = np.linspace(x1.min(), x1.max(), 20)
x2_grid = np.linspace(x2.min(), x2.max(), 20)
X1_surf, X2_surf = np.meshgrid(x1_grid, x2_grid)

# Appearance settings shared by every plane.
plane_style = {'opacity': 0.5, 'colorscale': 'Oranges', 'showscale': False}

# One surface per value of t.
for t in t_values:
    b2_t = b2_est + t
    b1_t = c - 3 * b2_t
    Y_surf = b0_est + b1_t * X1_surf + b2_t * X2_surf

    plane_trace = go.Surface(
        x=X1_surf,
        y=X2_surf,
        z=Y_surf,
        name=f"Plane t={t:.2f}",
        **plane_style,
    )
    fig.add_trace(plane_trace)

# ------------------------------------------------------------------------------
# 4. The line shared by all of the planes
# ------------------------------------------------------------------------------
# Substituting x2 = 3*x1 into any plane above gives y = b0_est + c*x1, which
# does not depend on t, so all planes pass through this line. Draw it over the
# observed x1 range.
# ------------------------------------------------------------------------------
x1_line = np.linspace(x1.min(), x1.max(), 50)
x2_line = 3 * x1_line
y_line = b0_est + c * x1_line  # y = b0 + (b1 + 3*b2)*x1

ridge_trace = go.Scatter3d(
    x=x1_line,
    y=x2_line,
    z=y_line,
    mode='lines',
    line={'color': 'red', 'width': 5},
    name='Common Intersection Line',
)
fig.add_trace(ridge_trace)

# Axis labels and a cubic aspect ratio for the interactive 3D scene.
scene_settings = {
    'xaxis_title': 'x1',
    'yaxis_title': 'x2',
    'zaxis_title': 'y',
    'aspectmode': 'cube',
}
fig.update_layout(
    title="Infinite Solutions: Exact Multicollinearity in Multiple Linear Regression",
    scene=scene_settings,
)

fig.show()

A particular least-squares solution (minimum-norm):
  b0 = 2.032, b1 = 0.371, b2 = 1.113
Rank of X = 2 (expect 2, since one column is dependent)


# ------------------------------------------------------------------------------
# 1. Generate two near-collinear datasets from the same underlying model
# ------------------------------------------------------------------------------

# NOTE: no seed is set in this section, so the draws below continue from
# whatever state the global NumPy RNG is in when this code runs.

# Number of points in each dataset
n = 50

# "near-collinearity" parameter: standard deviation of the perturbation added
# to 3*x1 to obtain x2 (smaller => closer to exact collinearity)
small_noise_scale = 0.05

# True underlying coefficients for the model: y = b0 + b1*x1 + b2*x2
b0_true = 2.0
b1_true = 3.0
b2_true = 1.5

# -------- Dataset A --------
x1_A = np.linspace(0, 1, n)
# near-collinear with x1_A
x2_A = 3*x1_A + small_noise_scale * np.random.randn(n)
noise_A = 0.5 * np.random.randn(n)

y_A = b0_true + b1_true*x1_A + b2_true*x2_A + noise_A

# -------- Dataset B --------
x1_B = np.linspace(0, 1, n)
# near-collinear with x1_B, but different noise
x2_B = 3*x1_B + small_noise_scale * np.random.randn(n)
noise_B = 0.5 * np.random.randn(n)

y_B = b0_true + b1_true*x1_B + b2_true*x2_B + noise_B

# ------------------------------------------------------------------------------
# 2. Fit each dataset independently using least squares
# ------------------------------------------------------------------------------

# Fit dataset A (design-matrix columns: intercept, x1, x2)
X_A = np.column_stack((np.ones(n), x1_A, x2_A))
beta_A, _, _, _ = np.linalg.lstsq(X_A, y_A, rcond=None)
b0_est_A, b1_est_A, b2_est_A = beta_A

# Fit dataset B
X_B = np.column_stack((np.ones(n), x1_B, x2_B))
beta_B, _, _, _ = np.linalg.lstsq(X_B, y_B, rcond=None)
b0_est_B, b1_est_B, b2_est_B = beta_B

# Print the estimated coefficients
print("Dataset A estimated coefficients:")
print(f"  b0 = {b0_est_A:.3f}, b1 = {b1_est_A:.3f}, b2 = {b2_est_A:.3f}")

print("\nDataset B estimated coefficients:")
print(f"  b0 = {b0_est_B:.3f}, b1 = {b1_est_B:.3f}, b2 = {b2_est_B:.3f}")

# ------------------------------------------------------------------------------
# 3. Plot both datasets (scatter) + fitted planes in a single 3D figure
# ------------------------------------------------------------------------------

fig = go.Figure()

# ---- Scatter of Dataset A ----
fig.add_trace(
    go.Scatter3d(
        x=x1_A,
        y=x2_A,
        z=y_A,
        mode='markers',
        marker=dict(size=4, color='blue'),
        name='Data A Points'
    )
)

# ---- Scatter of Dataset B ----
fig.add_trace(
    go.Scatter3d(
        x=x1_B,
        y=x2_B,
        z=y_B,
        mode='markers',
        marker=dict(size=4, color='red'),
        name='Data B Points'
    )
)

# Create a meshgrid for plotting surfaces.
# The grid limits come from the observed data rather than the nominal ranges
# [0, 1] x [0, 3]: x2 = 3*x1 + noise can fall below 0 or above 3, and a
# hard-coded grid would leave those points outside the drawn planes.
grid_size = 20
x1_lo = min(x1_A.min(), x1_B.min())
x1_hi = max(x1_A.max(), x1_B.max())
x2_lo = min(x2_A.min(), x2_B.min())
x2_hi = max(x2_A.max(), x2_B.max())
x1_grid = np.linspace(x1_lo, x1_hi, grid_size)
x2_grid = np.linspace(x2_lo, x2_hi, grid_size)
X1_surf, X2_surf = np.meshgrid(x1_grid, x2_grid)

# ---- Plane for Dataset A ----
Y_surf_A = (b0_est_A
            + b1_est_A * X1_surf
            + b2_est_A * X2_surf)

fig.add_trace(
    go.Surface(
        x=X1_surf,
        y=X2_surf,
        z=Y_surf_A,
        opacity=0.4,
        colorscale='Blues',
        showscale=False,
        name='Fitted Plane A'
    )
)

# ---- Plane for Dataset B ----
Y_surf_B = (b0_est_B
            + b1_est_B * X1_surf
            + b2_est_B * X2_surf)

fig.add_trace(
    go.Surface(
        x=X1_surf,
        y=X2_surf,
        z=Y_surf_B,
        opacity=0.4,
        colorscale='Reds',
        showscale=False,
        name='Fitted Plane B'
    )
)

# Final layout: axis labels and a cubic aspect ratio for the 3D scene
fig.update_layout(
    title="Two Near-Collinear Datasets and Their Fitted Planes",
    scene=dict(
        xaxis_title='x1',
        yaxis_title='x2',
        zaxis_title='y',
        aspectmode='cube'
    )
)

fig.show()

Dataset A estimated coefficients:
  b0 = 2.034, b1 = 11.165, b2 = -1.256

Dataset B estimated coefficients:
  b0 = 2.254, b1 = 4.857, b2 = 0.764


# ------------------------------------------------------------------------------
# 1. Two datasets with independent predictors, drawn from one shared model
# ------------------------------------------------------------------------------

# NOTE: no seed is set in this section, so the draws below continue from
# whatever state the global NumPy RNG is in when this code runs.

n = 50  # points per dataset

# Coefficients of the generating model: y = b0 + b1*x1 + b2*x2
b0_true = 2.0
b1_true = 3.0
b2_true = 1.5


# --- Dataset A ---
# Both predictors are separate uniform draws on [0, 1), so x2 is not tied
# to x1.
x1_A = np.random.rand(n)
x2_A = np.random.rand(n)
noise_A = 0.5 * np.random.randn(n)
y_A = b0_true + b1_true * x1_A + b2_true * x2_A + noise_A

# --- Dataset B ---
# Same recipe, fresh draws.
x1_B = np.random.rand(n)
x2_B = np.random.rand(n)
noise_B = 0.5 * np.random.randn(n)
y_B = b0_true + b1_true * x1_B + b2_true * x2_B + noise_B

# ------------------------------------------------------------------------------
# 2. Separate least-squares fits
# ------------------------------------------------------------------------------

# Dataset A (design-matrix columns: intercept, x1, x2)
X_A = np.column_stack((np.ones(n), x1_A, x2_A))
beta_A, _, _, _ = np.linalg.lstsq(X_A, y_A, rcond=None)
b0_est_A, b1_est_A, b2_est_A = beta_A

# Dataset B
X_B = np.column_stack((np.ones(n), x1_B, x2_B))
beta_B, _, _, _ = np.linalg.lstsq(X_B, y_B, rcond=None)
b0_est_B, b1_est_B, b2_est_B = beta_B

print("Dataset A estimated coefficients:")
print(f"  b0 = {b0_est_A:.3f}, b1 = {b1_est_A:.3f}, b2 = {b2_est_A:.3f}")

print("\nDataset B estimated coefficients:")
print(f"  b0 = {b0_est_B:.3f}, b1 = {b1_est_B:.3f}, b2 = {b2_est_B:.3f}")

# ------------------------------------------------------------------------------
# 3. One 3D figure: both point clouds plus both fitted planes
# ------------------------------------------------------------------------------

fig = go.Figure()

# --- Scatter traces (A first, then B) ---
scatter_specs = (
    (x1_A, x2_A, y_A, 'blue', 'Data A'),
    (x1_B, x2_B, y_B, 'red', 'Data B'),
)
for pts_x1, pts_x2, pts_y, pts_color, pts_label in scatter_specs:
    points_trace = go.Scatter3d(
        x=pts_x1,
        y=pts_x2,
        z=pts_y,
        mode='markers',
        marker={'size': 4, 'color': pts_color},
        name=pts_label,
    )
    fig.add_trace(points_trace)

# Grid over the unit square on which both planes are evaluated.
grid_size = 20
x1_grid = np.linspace(0, 1, grid_size)
x2_grid = np.linspace(0, 1, grid_size)
X1_surf, X2_surf = np.meshgrid(x1_grid, x2_grid)

# Fitted plane heights on that grid.
Y_surf_A = b0_est_A + b1_est_A * X1_surf + b2_est_A * X2_surf
Y_surf_B = b0_est_B + b1_est_B * X1_surf + b2_est_B * X2_surf

# --- Surface traces (A first, then B) ---
surface_specs = (
    (Y_surf_A, 'Blues', 'Plane A'),
    (Y_surf_B, 'Reds', 'Plane B'),
)
for plane_z, plane_scale, plane_label in surface_specs:
    plane_trace = go.Surface(
        x=X1_surf,
        y=X2_surf,
        z=plane_z,
        opacity=0.4,
        colorscale=plane_scale,
        showscale=False,
        name=plane_label,
    )
    fig.add_trace(plane_trace)

# Axis labels and a cubic aspect ratio for the 3D scene.
scene_settings = {
    'xaxis_title': 'x1',
    'yaxis_title': 'x2',
    'zaxis_title': 'y',
    'aspectmode': 'cube',
}
fig.update_layout(
    title="Two Datasets (No Collinearity) from the Same Model",
    scene=scene_settings,
)

fig.show()

Dataset A estimated coefficients:
  b0 = 2.049, b1 = 3.057, b2 = 1.416

Dataset B estimated coefficients:
  b0 = 1.714, b1 = 3.380, b2 = 1.573

Exact Multicollinearity

Near-collinearity: high variance of $\boldsymbol{\hat{\beta}}$

Almost no collinearity