diff --git a/docs/control-server-guide.md b/docs/control-server-guide.md index 55c50da..a4295d6 100644 --- a/docs/control-server-guide.md +++ b/docs/control-server-guide.md @@ -4,38 +4,22 @@ **IP:** 192.168.1.127 **Location:** pve2 **User:** maddox -**Last Updated:** January 23, 2026 - ---- - -## Overview - -The control server is the centralized command center for managing the Proxmox cluster infrastructure. It provides: - -- **Passwordless SSH** to all 13 managed hosts -- **Ansible automation** for cluster-wide operations -- **tmux sessions** for multi-host management -- **Git-based configuration** synced to Forgejo +**Last Updated:** January 24, 2026 --- ## Quick Start -### Launch Interactive Menu ```bash +# Launch interactive menu ~/scripts/control-menu.sh -``` -### Launch Multi-Host SSH Session -```bash +# Or jump straight to multi-host SSH ~/scripts/ssh-manager.sh -``` -### Run Ansible Ad-Hoc Command -```bash +# Or run Ansible directly cd ~/clustered-fucks ansible all -m ping -ansible docker_hosts -m shell -a "docker ps --format 'table {{.Names}}\t{{.Status}}'" ``` --- @@ -46,9 +30,8 @@ ansible docker_hosts -m shell -a "docker ps --format 'table {{.Names}}\t{{.Statu /home/maddox/ ├── .ssh/ │ ├── config # SSH host definitions -│ ├── tmux-hosts.conf # tmux session configuration │ ├── id_ed25519 # SSH private key -│ └── id_ed25519.pub # SSH public key (add to new hosts) +│ └── id_ed25519.pub # Public key (add to new hosts) │ ├── clustered-fucks/ # Git repo (synced to Forgejo) │ ├── ansible.cfg # Ansible configuration @@ -56,12 +39,18 @@ ansible docker_hosts -m shell -a "docker ps --format 'table {{.Names}}\t{{.Statu │ │ ├── hosts.yml # Host inventory │ │ └── group_vars/ │ │ └── all.yml # Global variables -│ └── playbooks/ -│ ├── check-status.yml -│ ├── docker-prune.yml -│ ├── restart-utils.yml -│ ├── update-all.yml -│ └── deploy-utils.yml +│ ├── playbooks/ +│ │ ├── check-status.yml +│ │ ├── docker-prune.yml +│ │ ├── restart-utils.yml +│ │ ├── update-all.yml +│ │ 
├── deploy-utils.yml +│ │ └── deploy-mealie.yml # NEW +│ └── compose-files/ # Docker compose files for services +│ └── databases/ +│ └── mealie/ +│ ├── docker-compose.yml +│ └── .env # NOT in git (secrets) │ └── scripts/ ├── ssh-manager.sh # tmux multi-host launcher @@ -73,274 +62,233 @@ ansible docker_hosts -m shell -a "docker ps --format 'table {{.Names}}\t{{.Statu ## Managed Hosts -| Host | IP | User | Port | Type | Group | -|------|-----|------|------|------|-------| -| pve2 | .3 | root | 22 | Proxmox | proxmox_nodes | -| pve-dell | .4 | root | 22 | Proxmox | proxmox_nodes | -| replicant | .80 | maddox | 22 | VM | docker_hosts | -| databases | .81 | root | 22 | VM | docker_hosts | -| immich | .82 | root | 22 | VM | docker_hosts | -| media-transcode | .120 | root | 22 | LXC | docker_hosts | -| network-services | .121 | root | 22 | LXC | docker_hosts | -| download-stack | .122 | root | 22 | LXC | docker_hosts | -| docker666 | .123 | root | 22 | LXC | docker_hosts | -| tailscale-home | .124 | root | 22 | LXC | docker_hosts | -| dns-lxc | .125 | root | 22 | LXC | infrastructure | -| nas | .251 | maddox | 44822 | NAS | legacy | -| alien | .252 | maddox | 22 | Docker | legacy | +| Host | IP | User | Type | Ansible Group | +|------|-----|------|------|---------------| +| pve2 | .3 | root | Proxmox | proxmox_nodes | +| pve-dell | .4 | root | Proxmox | proxmox_nodes | +| replicant | .80 | maddox | VM | docker_hosts | +| databases | .81 | root | VM | docker_hosts | +| immich | .82 | root | VM | docker_hosts | +| media-transcode | .120 | root | LXC | docker_hosts | +| network-services | .121 | root | LXC | docker_hosts | +| download-stack | .122 | root | LXC | docker_hosts | +| docker666 | .123 | root | LXC | docker_hosts | +| tailscale-home | .124 | root | LXC | docker_hosts | +| dns-lxc | .125 | root | LXC | infrastructure | +| nas | .251 | maddox | NAS | legacy | +| alien | .252 | maddox | Docker | legacy | --- -## Ansible Host Groups +## Ansible Playbooks -| Group | 
Members | Use Case | -|-------|---------|----------| -| `all` | All 13 hosts | Connectivity tests | -| `docker_hosts` | 8 hosts | Docker operations | -| `all_managed` | 11 hosts | System updates | -| `proxmox_nodes` | pve2, pve-dell | Node-level ops | -| `infrastructure` | dns-lxc | Non-Docker infra | -| `legacy` | nas, alien | Manual operations | -| `vms` | replicant, databases, immich | VM-specific | -| `lxcs` | 6 LXC containers | LXC-specific | +| Playbook | Target | Description | +|----------|--------|-------------| +| `check-status.yml` | all_managed | Disk, memory, container counts | +| `update-all.yml` | docker_hosts | apt upgrade all hosts | +| `docker-prune.yml` | docker_hosts | Clean unused Docker resources | +| `restart-utils.yml` | docker_hosts | Restart utils stack | +| `deploy-utils.yml` | docker_hosts | Deploy utils to new host | +| `deploy-mealie.yml` | databases | Deploy Mealie stack | ---- +### Running Playbooks -## Playbooks Reference - -### check-status.yml -Reports disk usage, memory usage, and container counts. - -```bash -ansible-playbook playbooks/check-status.yml -``` - -**Target:** all_managed -**Output:** Per-host status line (Disk=X% Mem=X% Containers=X) - ---- - -### update-all.yml -Runs apt update and upgrade on all Docker hosts. - -```bash -ansible-playbook playbooks/update-all.yml - -# With reboot if required: -ansible-playbook playbooks/update-all.yml -e "reboot=true" -``` - -**Target:** docker_hosts -**Note:** Checks for reboot requirement, notifies but doesn't auto-reboot unless `-e "reboot=true"` - ---- - -### docker-prune.yml -Cleans unused Docker resources (images, networks, build cache). - -```bash -ansible-playbook playbooks/docker-prune.yml -``` - -**Target:** docker_hosts -**Note:** dns-lxc will fail (no Docker) - this is expected - ---- - -### restart-utils.yml -Restarts the utils stack (watchtower, autoheal, docker-proxy) on all hosts. 
- -```bash -ansible-playbook playbooks/restart-utils.yml -``` - -**Target:** docker_hosts -**Note:** Uses host-specific `docker_appdata` variable for non-standard paths - ---- - -### deploy-utils.yml -Deploys standardized utils stack to a new host. - -```bash -ansible-playbook playbooks/deploy-utils.yml --limit new-host -``` - -**Target:** docker_hosts -**Note:** Creates directory structure and .env file only; compose file must be added separately - ---- - -## Scripts Reference - -### ssh-manager.sh - -Launches a tmux session with SSH connections to all hosts. - -```bash -~/scripts/ssh-manager.sh -``` - -**Features:** -- Window 0: Control (local shell) -- Windows 1-13: Individual host SSH sessions -- Final window: Multi-View (all hosts in split panes) - -**Navigation:** -- `Ctrl+b` then window number to switch -- `Ctrl+b d` to detach (keeps session running) -- `tmux attach -t cluster` to reattach - ---- - -### control-menu.sh - -Interactive menu for common operations. - -```bash -~/scripts/control-menu.sh -``` - -**Menu Options:** -``` -[1] Ping All - Test connectivity -[2] Check Status - Disk/memory/containers -[3] Update All - apt upgrade docker hosts -[4] Docker Prune - Clean unused resources -[5] Restart Utils - Restart utils stack everywhere - -[A] Ad-hoc Command - Run custom command -[I] Inventory - Show host list -[S] SSH Manager - Launch tmux session - -[Q] Quit -``` - ---- - -### add-host.sh - -Wizard for onboarding new hosts. - -```bash -~/scripts/add-host.sh -``` - -**Steps:** -1. Prompts for hostname, IP, user, port, description -2. Tests SSH connectivity -3. Copies SSH key if needed -4. Adds to `~/.ssh/config` -5. Adds to `~/.ssh/tmux-hosts.conf` - -**Note:** Ansible inventory must be edited manually. 
- ---- - -## Common Operations - -### SSH to a Specific Host -```bash -ssh replicant -ssh databases -ssh nas # Uses port 44822 automatically -``` - -### Run Command on All Docker Hosts ```bash cd ~/clustered-fucks -ansible docker_hosts -m shell -a "docker ps -q | wc -l" -``` -### Run Command on Specific Host -```bash -ansible replicant -m shell -a "df -h" -``` +# Check all hosts +ansible-playbook playbooks/check-status.yml -### Copy File to All Hosts -```bash -ansible docker_hosts -m copy -a "src=/path/to/file dest=/path/to/dest" -``` +# Update specific host +ansible-playbook playbooks/update-all.yml --limit databases -### Check Specific Service -```bash -ansible docker_hosts -m shell -a "docker ps --filter name=watchtower --format '{{.Status}}'" -``` - -### View Ansible Inventory -```bash -ansible-inventory --graph -ansible-inventory --list +# Deploy service +ansible-playbook playbooks/deploy-mealie.yml ``` --- ## Git Workflow -### Repository Location +### Repository Info - **Local:** `~/clustered-fucks/` - **Remote:** `ssh://git@192.168.1.81:2222/maddox/clustered-fucks.git` - **Web:** https://git.3ddbrewery.com/maddox/clustered-fucks -### Standard Workflow +### Daily Workflow + ```bash cd ~/clustered-fucks -# Make changes to playbooks/inventory +# 1. Before making changes, get latest +git pull + +# 2. Make your changes (edit files, create playbooks, etc.) vim playbooks/new-playbook.yml -# Commit and push +# 3. See what changed +git status + +# 4. Stage all changes git add -A -git commit -m "Add new playbook" -git push origin main + +# 5. Commit with a message +git commit -m "Add deployment playbook for service X" + +# 6. Push to Forgejo +git push ``` -### Pull Latest Changes +### After Creating/Editing Files + +**Always commit your changes!** Otherwise they only exist on the control server and could be lost. 
+ ```bash cd ~/clustered-fucks -git pull origin main +git add -A +git commit -m "Description of what you changed" +git push +``` + +### Quick Reference + +| Command | What it does | +|---------|--------------| +| `git status` | Show what files have changed | +| `git pull` | Get latest changes from Forgejo | +| `git add -A` | Stage all changes for commit | +| `git commit -m "msg"` | Save changes with a message | +| `git push` | Upload commits to Forgejo | +| `git log --oneline -5` | Show last 5 commits | +| `git diff` | Show what changed (before staging) | + +### What Goes in Git vs What Doesn't + +**✅ Commit these:** +- Ansible playbooks (`playbooks/*.yml`) +- Inventory files (`inventory/hosts.yml`) +- Docker compose files (`compose-files/**/docker-compose.yml`) +- Documentation and scripts + +**❌ Never commit these:** +- `.env` files (contain passwords/secrets) +- Private keys +- Temporary files + +The `.gitignore` file already excludes `.env` files. + +--- + +## Service Migration Workflow + +### Standard Process + +```bash +cd ~/clustered-fucks + +# 1. Create compose file directory +mkdir -p compose-files/<category>/<service> + +# 2. Create docker-compose.yml +vim compose-files/<category>/<service>/docker-compose.yml + +# 3. Create .env for secrets (not committed to git) +vim compose-files/<category>/<service>/.env + +# 4. Create Ansible playbook +vim playbooks/deploy-<service>.yml + +# 5. Deploy via Ansible +ansible-playbook playbooks/deploy-<service>.yml + +# 6. Rsync data from alien (if needed) +ssh alien "docker stop <service>" +rsync -avP maddox@alien:/path/to/data/ root@<target-host>:/home/docker/appdata/<service>/ + +# 7. Start the service +ansible <target-host> -m shell -a "cd /home/docker/appdata/<service> && docker compose up -d" + +# 8. Update Traefik route (on Hetzner) + +# 9. Test via domain + +# 10. Stop old container on alien +ssh alien "docker stop <service>" + +# 11. 
Commit playbook to git +git add -A +git commit -m "Add <service> deployment" +git push +``` + +### Playbook Template + +```yaml +--- +- name: Deploy <service> to <target-host> + hosts: <target-host> + become: yes + vars: + service_name: <service> + service_dir: /home/docker/appdata/{{ service_name }} + + tasks: + - name: Create directories + file: + path: "{{ item }}" + state: directory + mode: '0755' + loop: + - "{{ service_dir }}" + + - name: Ensure proxy network exists + community.docker.docker_network: + name: proxy + state: present + + - name: Copy docker-compose.yml + copy: + src: ../compose-files/<category>/{{ service_name }}/docker-compose.yml + dest: "{{ service_dir }}/docker-compose.yml" + mode: '0644' + + - name: Copy .env file + copy: + src: ../compose-files/<category>/{{ service_name }}/.env + dest: "{{ service_dir }}/.env" + mode: '0600' + + - name: Start stack + community.docker.docker_compose_v2: + project_src: "{{ service_dir }}" + state: present + recreate: always ``` --- -## Adding a New Host +## Common Operations -### 1. Run Onboarding Script +### SSH to Specific Host ```bash -~/scripts/add-host.sh +ssh databases +ssh alien +ssh nas # Uses port 44822 automatically ``` -### 2. Edit Ansible Inventory +### Run Command on All Docker Hosts ```bash -vim ~/clustered-fucks/inventory/hosts.yml +ansible docker_hosts -m shell -a "docker ps -q | wc -l" ``` -Add under appropriate group: -```yaml - new-host: - ansible_host: 192.168.1.XXX - ansible_user: root -``` - -If non-standard appdata path: -```yaml - new-host: - ansible_host: 192.168.1.XXX - ansible_user: root - docker_appdata: /custom/path/appdata -``` - -### 3. Test Connection +### Check Container Logs ```bash -ansible new-host -m ping +ansible databases -m shell -a "docker logs mealie 2>&1 | tail -30" ``` -### 4. 
Commit Changes +### Copy File to Host ```bash -cd ~/clustered-fucks -git add -A -git commit -m "Add new-host to inventory" -git push origin main +ansible databases -m copy -a "src=./file.txt dest=/tmp/file.txt" ``` --- @@ -349,14 +297,11 @@ git push origin main ### SSH Connection Refused ```bash -# Check if SSH is running on target -ssh -v hostname - -# If connection refused, access via Proxmox console: +# Access via Proxmox console: # For LXC: pct enter # For VM: qm terminal -# Inside container/VM: +# Then inside: apt install openssh-server systemctl enable ssh systemctl start ssh @@ -364,191 +309,24 @@ systemctl start ssh ### SSH Permission Denied ```bash -# Check key is in authorized_keys on target -ssh-copy-id hostname +# Copy your key to the host +ssh-copy-id -# If still failing, check permissions on target: -# (via Proxmox console) -chmod 700 ~ +# Or fix permissions on target: chmod 700 ~/.ssh chmod 600 ~/.ssh/authorized_keys -chown -R root:root ~/.ssh # or appropriate user ``` -### Ansible "Missing sudo password" -The host is configured with `ansible_become: yes` but no password is set. - -Fix: Either remove `ansible_become: yes` from inventory, or set up passwordless sudo on target: +### Git Push Fails ```bash -echo "username ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/username -``` +# Check remote is correct +git remote -v -### Playbook Skips Host -Check if host is in the correct group: -```bash -ansible-inventory --graph -``` +# Should show: +# origin ssh://git@192.168.1.81:2222/maddox/clustered-fucks.git -Check host variables: -```bash -ansible-inventory --host hostname -``` - -### Docker Command Not Found -Host is in `docker_hosts` but doesn't have Docker. 
Move to `infrastructure` group: -```yaml - infrastructure: - hosts: - hostname: - ansible_host: 192.168.1.XXX -``` - ---- - -## Non-Standard Configurations - -### Hosts with Different Appdata Paths - -| Host | Path | -|------|------| -| replicant | `/home/maddox/docker/appdata` | -| docker666 | `/root/docker/appdata` | -| All others | `/home/docker/appdata` | - -These are handled via `docker_appdata` variable in inventory. - -### Hosts with Non-Standard SSH - -| Host | Port | User | -|------|------|------| -| nas | 44822 | maddox | - -Configured in both `~/.ssh/config` and `inventory/hosts.yml`. - -### Hosts Without Utils Stack - -| Host | Reason | -|------|--------| -| tailscale-home | Only runs Headscale, no utils needed | -| dns-lxc | No Docker installed | - ---- - -## Maintenance - -### Update Ansible -```bash -sudo apt update -sudo apt upgrade ansible -``` - -### Regenerate SSH Keys (if compromised) -```bash -# Generate new key -ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 - -# Distribute to all hosts (will prompt for passwords) -for host in pve2 pve-dell replicant databases immich media-transcode network-services download-stack docker666 tailscale-home dns-lxc alien; do - ssh-copy-id $host -done - -# NAS requires special handling -ssh-copy-id -p 44822 maddox@192.168.1.251 -``` - -### Backup Configuration -```bash -cd ~/clustered-fucks -git add -A -git commit -m "Backup: $(date +%Y-%m-%d)" -git push origin main -``` - ---- - -## Reference Files - -### ~/.ssh/config -``` -Host * - StrictHostKeyChecking accept-new - ServerAliveInterval 60 - ServerAliveCountMax 3 - -Host pve2 - HostName 192.168.1.3 - User root - -Host pve-dell - HostName 192.168.1.4 - User root - -Host replicant - HostName 192.168.1.80 - User maddox - -Host databases - HostName 192.168.1.81 - User root - -Host immich - HostName 192.168.1.82 - User root - -Host media-transcode - HostName 192.168.1.120 - User root - -Host network-services - HostName 192.168.1.121 - User root - -Host 
download-stack - HostName 192.168.1.122 - User root - -Host docker666 - HostName 192.168.1.123 - User root - -Host tailscale-home - HostName 192.168.1.124 - User root - -Host dns-lxc - HostName 192.168.1.125 - User root - -Host nas - HostName 192.168.1.251 - User maddox - Port 44822 - -Host alien - HostName 192.168.1.252 - User maddox -``` - -### ~/clustered-fucks/ansible.cfg -```ini -[defaults] -inventory = inventory/hosts.yml -remote_user = root -host_key_checking = False -retry_files_enabled = False -gathering = smart -fact_caching = jsonfile -fact_caching_connection = /tmp/ansible_facts -fact_caching_timeout = 86400 -stdout_callback = yaml -forks = 10 - -[privilege_escalation] -become = False - -[ssh_connection] -pipelining = True -ssh_args = -o ControlMaster=auto -o ControlPersist=60s +# If wrong, fix it: +git remote set-url origin ssh://git@192.168.1.81:2222/maddox/clustered-fucks.git ``` --- @@ -557,4 +335,5 @@ ssh_args = -o ControlMaster=auto -o ControlPersist=60s | Date | Change | |------|--------| -| 2026-01-23 | Initial deployment, all hosts connected, playbooks tested | +| 2026-01-23 | Initial deployment | +| 2026-01-24 | Added deploy-mealie.yml, enhanced git workflow docs | \ No newline at end of file