#!/usr/bin/perl
#++
#   $Id: mhsync,v 1.17 2019/08/03 03:33:05 az Exp $
#
#   File:		mhsync
#   Date:		Thu Nov 24 12:44:45 2011
#   Author:		Alexander Zangerl (az)
#
#   Abstract: bidirectional sync between two mh hierarchies
#   uses ssh and sshfs
#
#   todos/missing features:
#	add exclusion mechanism for trash or other folders
#	find a way to sync sequence-only changes
#	don't overwrite sequence entries for unknown, brand-new files
#
#   copyright (c) 2011-2019 Alexander Zangerl <az@snafu.priv.at>
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License version 2
#   as published by the Free Software Foundation.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

use strict;
use Digest::MD5;
use File::Find;
use Getopt::Std;
use File::Temp qw(tempdir);
use File::Basename;
use Data::Dumper;
use Sys::Hostname;
use constant { INO=>0, SIZE=>1, CTIME=>2, MTIME=>3,
							 CID=>4, SEQS=>5, HERE=>0, THERE=>1  };

# sorta-configurables
my $mdir="$ENV{HOME}/Mail";
my $prefix="$ENV{HOME}/Mail/";
my $statfname=".mhsync";

my $maxdelta=200;

my %options;
my $usage="usage: $0 [-d N] [-n][-X] [-u|<host>]
-d: print debug info (default: only show nonresolvable conflicts)
-n: dry-run, make no mailbox changes
-X: apply heuristic for deleting old mail (default: always copy)
-u: only update local status file - otherwise update and sync with host\n";
die $usage if (!getopts("nd:Xu",\%options)
	       or (!$options{u} and @ARGV!=1));
$options{d}||=1 if ($options{n}); # ensure some debug output with dry-run mode

my $host=$ARGV[0];

# content object: {contentid} -> [0/1] -> [list of filenames]
# names object: {filename} ->[0/1]->[ino,size,ctime,mtime,contentid,[seqs,...]]
my (%content,%names);
my ($td, $lastsync, $othersidesync);
my $start=time;
my $me = lc(hostname);

# first read old status, then update from filesystem
$lastsync = readnames("$mdir/$statfname",\%names,HERE) if (-f "$mdir/$statfname");
updatenames($mdir,$prefix,\%names,HERE);

my %clashes;# key=cid, value=sol

# actual sync requested? then do data collection on remote side, too
if (!$options{u})
{
	$lastsync->{$host} //= 0;
	debug(2, "local lastsync for host $host is $lastsync->{$host}");

	# ditto, but on the remote host
	my $rc=0xffff & system("ssh","$host","mhsync","-u");
	die "update on $host failed: $!\n" if ($rc);

	$td=tempdir("/tmp/mhsync.XXXXXXX",CLEANUP=>0);
	$rc=0xffff & system("sshfs","$host:$mdir",$td);
	die "sshfs $host failed: $!" if ($rc);
	# no remote journal, earlier -u consumes it
	$othersidesync = readnames("$td/$statfname", \%names, THERE);
	debug(2, "remote lastsync for host $me is $othersidesync->{$me}");

	# now prime the content lookup hash
	for my $k (keys %names)
	{
		for my $cat (HERE,THERE)
		{
	    if (exists $names{$k}->[$cat])
	    {
				my $cid=$names{$k}->[$cat]->[CID];
				push @{$content{$cid}->[$cat]},$k;
	    }
		}
	}

	# most of the actual decisions happen here
	for my $cid (keys %content)
	{
		my $this=$content{$cid};

		my ($status,%where)=compareset($this->[HERE],$this->[THERE]);

		# same content and same names (any number thereof)
		if ($status)
		{
	    debug(3,"identical ".join(" ",@{$this->[HERE]}));
		}
		# content exists on one side only
		# if existing is older than lastsync (and delete allowed): delete existing side
		# otherwise copy to missing side
		elsif (@$this!=2 || !$this->[HERE] || ($this->[HERE] && !@{$this->[HERE]})
					 || !$this->[THERE] || ($this->[THERE] && !@{$this->[THERE]}))
		{
	    my $sol=$this->[HERE]?HERE:THERE; 	# present where?

	    my $newest=max(map { $names{$_}->[$sol]->[MTIME] } (@{$this->[$sol]}));
	    if ($newest < $lastsync->{$host} and $options{X})
	    {
				debug(1,join(" ",@{$this->[$sol]})." older than lastsync, deleting ".($sol?"there":"here"));
				for my $idx (0..$#{$this->[$sol]})
				{
					deletemail($cid,$sol,$idx);
				}
	    }
	    else
	    {
				debug(1,join(" ",@{$this->[$sol]})." missing on ".($sol?"this":"other")." side, copying");
				# what, good state
				syncstate($cid,$sol,%where);
	    }
		}
		# content exists on both sides, but different names/locations
		# find newest among the involved names, and use that side as
		# desired state. this heuristic fails with clash-renamed files,
		# as the renamed file is *certainly* newer :-/
		else
		{
	    my $hereage=max(map { $names{$_}->[HERE]->[MTIME],
														$names{$_}->[HERE]->[CTIME] }
											(@{$this->[HERE]}));
	    my $thereage=max(map { $names{$_}->[THERE]->[MTIME],
														 $names{$_}->[THERE]->[CTIME]}
											 (@{$this->[THERE]}));

	    my $sol=$hereage>$thereage?HERE:THERE;
	    debug(1,($sol?"remote ":"local ").join(" ",@{$this->[$sol]})
						." newer than ".($sol?"local ":"remote ")
						.join(" ",@{$this->[!$sol]}).", renaming "
						.($sol?"here":"there"));
	    # what, good state
	    syncstate($cid,$sol,%where);
		}
	}

	# now try to resolve clashes (e.g. caused by pack or sort folder)
	for my $cid (keys %clashes)
	{
		my $this=$content{$cid};
		my $sol=$clashes{$cid};
		my ($status,%where)=compareset($this->[HERE],$this->[THERE]);

		debug(1,"unclashing local ".join(" ",@{$this->[HERE]})
					.", remote ".join(" ",@{$this->[THERE]}).", want to rename "
					.($sol?"here":"there"));
		syncstate($cid,$sol,%where);
	}
}

# don't update the cache file if dry-running (as the new data is likely dud)
# ditto with sequences
if (!$options{n})
{
	# don't overwrite the lastsync timestamp when doing only a state update fixme
	$lastsync->{$host} = $start if (!$options{u});
	$othersidesync->{$me} = $start if (!$options{u});

	my $hereseqs = writenames("$mdir/$statfname",\%names, HERE, $lastsync); # local state
	my $thereseqs = writenames("$td/$statfname",\%names, THERE, $othersidesync) if ($td);  # remote state, only if not -u

	# write out the sequence files, but only if syncing
	if (!$options{u})
	{
		writesequences($mdir,$hereseqs);
		writesequences($td,$thereseqs);
	}
}

if ($td)
{
	my $rc=0xffff & system(qw(fusermount -u),$td);
	die "can't unmount $td: $!\n" if ($rc);
	rmdir($td);
}
exit 0;

sub fullpath
{
	my ($relpath,$whichside)=@_;
	return ($whichside==HERE?$mdir:$td)."/$relpath";
}

# push desired state for a single mail
# inputs: content id, index of good state, set count (1 this side, 2 other side, 3 both)
sub syncstate
{
	my ($cid,$source,%where)=@_;
	my $this=$content{$cid};

	# copy missing ones from source to dest; or do one-sided rename or copy if possible
	for my $idx (0..$#{$this->[$source]})
	{
		my $name=$this->[$source]->[$idx];
		next if ($where{$name}==3); # on both sides

		# ensure clash-free by renaming to (temporary) free name
		if (exists $names{$name}->[!$source])
		{
	    my $newname=findreplacement($name);
	    debug(1,"clash - must replace $name with $newname");
	    $clashes{$cid}=$source;
	    $name=$newname;
		}

		# can we rename one to fix this or do we have to copy?
		# what is already present on the correct side only?
		my ($therename)=grep $where{$_}==(!$source)+1, keys %where;
		if ($therename)
		{
	    debug(2,"renaming $therename to $name on ".($source?"this":"other")." side");
	    my ($renidx)=grep $this->[!$source]->[$_] eq $therename, (0..$#{$this->[!$source]});
	    renamemail($cid,!$source,$renidx,$name);
	    $where{$name}=3;
	    delete $where{$therename};
		}
		else
		{
	    # can we copy locally or do we have to go across the net?
	    my ($localav)=grep { $where{$_}==3 || $where{$_}==(!$source)+1 } keys %where;
	    if ($localav)
	    {
				debug(2,"same-side copy $localav to $name on ".($source?"this":"other")." side");
				my ($copyidx)=grep $this->[!$source]->[$_] eq $localav, (0..$#{$this->[!$source]});
				copymail($cid,!$source,$copyidx,$name,!$source);
	    }
	    else
	    {
				debug(2,"copying $name to ".($source?"this":"other")." side");
				copymail($cid,$source,$idx,$name,!$source);
	    }
	    $where{$name}=3;
		}
	}

	# finally remove unwanted ones on dest
	for my $idx (0..$#{$this->[!$source]})
	{
		my $name=$this->[!$source]->[$idx];

		next if ($where{$name}==3 or !$where{$name}); # on both sides
		debug(2,"removing $name on ".($source?"this":"other")." side");
		deletemail($cid,!$source,$idx);
	}
}


# rename mail identified by content+side+fn_index to new filename
# does ops same side only! also updates names and content datastructures
# dies on errors
sub renamemail
{
	my ($cid,$side,$sindex,$tfn)=@_;
	my $this=$content{$cid};
	my $sfn=$this->[$side]->[$sindex];

	debug(2,"\trename ".($side?"remote":"local")." $sfn to $tfn");
	my $source=fullpath($sfn,$side);
	my $target=fullpath($tfn,$side);
	if (!$options{n})
	{
		die "$tfn exists (unexpected), won't overwrite!\n" if (-f $target);
		my ($name,$path)=fileparse($target);
		if (!-d $path)
		{
	    my $thisdir;
	    for (split(/\//,$path))
	    {
				next if ($thisdir eq '/');
				$thisdir.="/$_";
				mkdir($thisdir,0755) if (!-d $thisdir);
	    }
		}
		rename($source,$target) or die "can't rename $sfn to $tfn: $!\n";
	}
	$names{$tfn}->[$side]=$names{$sfn}->[$side];
	delete $names{$sfn}->[$side];
	$this->[$side]->[$sindex]=$tfn;
}

# copy mail identified by content+side+index to new filename
# can operate across sides
# updates names and content datastructures
# dies on errors
sub copymail
{
	my ($cid,$sside,$sindex,$tfn,$tside)=@_;
	my $this=$content{$cid};
	my $sfn=$this->[$sside]->[$sindex];

	debug(2,"\tcopy ".($sside?"remote":"local")." $sfn to ".($tside?"remote":"local")." $tfn");
	my $source=fullpath($sfn,$sside);
	my $target=fullpath($tfn,$tside);
	if (!$options{n})
	{
		die "$tfn exists (unexpected), won't overwrite!\n" if (-f $target);
		my ($name,$path)=fileparse($target);
		if (!-d $path)
		{
	    my $thisdir;
	    for (split(/\//,$path))
	    {
				next if ($thisdir eq '/');
				$thisdir.="/$_";
				mkdir($thisdir,0755) if (!-d $thisdir);
	    }
		}
		my $rc=0xffff & system("cp","-an",$source,$target);
		die "can't copy $sfn to $tfn: $!\n" if ($rc);
	}

	$names{$tfn}->[$tside]=$names{$sfn}->[$sside];
	push @{$this->[$tside]},$tfn;
}

sub deletemail
{
	my ($cid,$side,$index)=@_;
	my $this=$content{$cid};
	my $sfn=$this->[$side]->[$index];

	debug(2,"\tdelete ".($side?"remote":"local"). " $sfn");
	if (!$options{n})
	{
		my $fp=fullpath($sfn,$side);
		unlink($fp) or die "can't unlink $sfn: $!\n";
	}

	delete $names{$sfn}->[$side];
	delete $this->[$side];
}



# overwrites sequence files with new info
sub writesequences
{
	my ($basedir,$seqs)=@_;

	for my $folder (keys %$seqs)
	{
		my @output;
		for my $k (keys %{$seqs->{$folder}})
		{
	    next if ($k eq "cur"); # pseudo-seq, not useful to sync
	    my @members=sort { $a <=> $b } keys %{$seqs->{$folder}->{$k}};
	    my @thisseq;

	    for (my $i=0; $i<=$#members; ++$i)
	    {
				my $start=$members[$i];
				my $j=$i;
				++$j while ($members[$j+1]==$members[$j]+1);

				push @thisseq, ($i==$j? $start: ("$start-".$members[$j]));
				$i=$j;
	    }
	    push @output, "$k: ".join(" ", @thisseq)."\n" if (@thisseq);
		}

		my $dest="$basedir/$folder/.mh_sequences";
		if (@output)
		{
	    debug(3,"seqs for folder $folder:\n\t".join("\t",@output));
	    open(F,">$dest") or die "can't write to $dest: $!\n";
	    print F @output;
	    close(F);
		}
		else
		{
	    debug(3,"no seqs for folder $folder");
	    unlink($dest) if (-f $dest);
		}
	}
}

sub expandmembers
{
	my ($string)=@_;
	my @res;
	for my $token (split(/\s+/,$string))
	{
		if ($token=~/^\d+$/)
		{
	    push @res,$token;
		}
		elsif ($token=~/^(\d+)-(\d+)$/)
		{
	    my ($start,$end)=($1,$2);
	    push @res, map { $_ } ($start..$end);
		}
		else
		{
	    die "unrecognized sequence \"$token\" in \"$string\"\n";
		}
	}
	return @res;
}

# use given filename and find filename in same folder that doesn't clash
sub findreplacement
{
	my ($templatefn)=@_;

	my ($head,$seqn);
	if ($templatefn=~m!^(\S+/,?)(\d+)$!)
	{
		($head,$seqn)=($1,$2);

		# first look backwards, then forwards
		for my $i (1..$maxdelta)
		{
	    last if ($seqn-$i<=1);
	    next if (exists $names{"$head".$seqn-$i});

	    $seqn=$seqn-$i;
	    last;
		}
		$seqn++ while(exists $names{"$head$seqn"});
		return "$head$seqn";
	}
	else
	{
		return undef;
	}
}

sub max
{
	my (@items)=@_;
	my $max;

	for (@items)
	{
		$max=$_ if ($_>=$max);
	}
	return $max;
}


sub debug
{
	my ($level,@msgs)=@_;

	if ($level<=$options{d})
	{
		for (@msgs)
		{
	    s/$/\n/ if !/\n$/;
	    print;
		}
	}
}

# reads previously dumped file names info into target
# parks it at index cat (0 local, 1 remote)
# returns lastsync time stamp - hashref
sub readnames
{
	my ($fn,$target,$cat)=@_;
	my %lastsync;

	open(F,$fn) or die "can't open $fn: $!\n";

	# first few lines: hostname:ts
	# rest: name lots of props
	for my $line (<F>)
	{
		chomp $line;
		if ($line =~ /^(\S+):(\d+)$/)
		{
			$lastsync{$1} = $2;
		}
		else
		{
			my ($n,$ino,$size,$ctime,$mtime,$cid,@seqs) = split(/ /, $line);
			next if (!$cid);					# unreadable lines
			$target->{$n}->[$cat]=[$ino,$size,$ctime,$mtime,$cid,(@seqs?[@seqs]:undef)];
    }
	}
	close(F);
	return \%lastsync;
}

# dumps file names info into file
# pulls info from cat (0 local, 1 remote)
# returns sequences hash (folder->name->members)
sub writenames
{
	my ($fn,$source,$cat,$lastsync)=@_;
	my %allseqs;

	open(F,">$fn") or die "can't open $fn: $!\n";
	print F map { "$_:$lastsync->{$_}\n" } (keys %$lastsync);

	for my $k (keys %$source)
	{
		next if (!$source->{$k}->[$cat]);
		my $seqs;
		if (my $memberships=$source->{$k}->[$cat]->[SEQS])
		{
	    $seqs=join(" ",@$memberships);
	    my ($fn,$msgno);
	    ($fn,$msgno)=($1,$2) if $k=~m!^(.+)/(\d+)$!;
	    for my $seqn (@$memberships)
	    {
				$allseqs{$fn}->{$seqn}->{$msgno}=1;
	    }
		}
		my $pdata="$k ".join(" ",@{$source->{$k}->[$cat]}[INO..CID])." $seqs\n";
		print F $pdata;
	}
	close(F);
	return \%allseqs;
}

# updates file names info with filesystem data
# modifies file name info in given cat
sub updatenames
{
	my ($path,$stripprefix,$target,$cat)=@_;
	my %seen;

	my @mustcheck;
	find(sub
			 { my ($fn,$full)=($_,$File::Find::name);
				 return if ($File::Find::dir=~/drafts$/);
				 push @mustcheck,$full if (-f $full && $fn=~/^\d+$/);
			 }, $path);
	for my $fn (@mustcheck)
	{
		my ($ino,$size,$ctime,$mtime)=(stat($fn))[1,7,10,9];
		my $shortname=$fn;
		$shortname=~s/^$stripprefix// if ($stripprefix);

		# do hashcheck if new or any attribute changed
		if (!$target || !$target->{$shortname} || !$target->{$shortname}->[$cat]
				|| $target->{$shortname}->[$cat]->[0]!=$ino
				|| $target->{$shortname}->[$cat]->[1]!=$size
				|| $target->{$shortname}->[$cat]->[2]!=$ctime
				|| $target->{$shortname}->[$cat]->[3]!=$mtime )
		{
	    $target->{$shortname}->[$cat]=[makedigest($fn)];
		}
		$seen{$shortname}=1;
	}

	# prune stuff that's gone
	for my $k (keys %$target)
	{
		delete $target->{$k}->[$cat] if !$seen{$k};
	}

	# read sequences for all involved dirs and update affected entries,
	# replacing any old sequence entries
	my (@dirs,%newseqs);
	find(sub
			 { my ($fn,$full)=($_,$File::Find::name);
				 push @dirs,$File::Find::dir if ($fn eq ".mh_sequences");
			 }, $path);
	for my $seqdir (@dirs)
	{
		my $seqfn="$seqdir/.mh_sequences";
		my $foldername=$seqdir; $foldername=~s!^$path/!!;

		debug(3,"reading $seqfn for folder $foldername");

		open(F,$seqfn) or die "can't read $seqfn: $!\n";
		for my $line (<F>)
		{
	    chomp $line;
	    my ($seqn,$members)=split(/:\s*/,$line);
	    my @all=expandmembers($members);
	    map { push @{$newseqs{"$foldername/$_"}},$seqn; } (@all);
		}
		close(F);
	}
	for my $k (keys %$target)
	{
		next if (!$target->{$k}->[$cat]); # orphaned sequence entries
		delete $target->{$k}->[$cat]->[5];
		$target->{$k}->[$cat]->[5]=$newseqs{$k};
	}
}

sub makedigest
{
	my ($fn)=@_;

	debug(2,"newly digesting $fn");

	my ($ino,$size,$ctime,$mtime)=(stat($fn))[1,7,10,9];
	my $md5=Digest::MD5->new;
	open(F,$fn) or die "can't open $fn: $!\n";
	binmode(F);
	$md5->addfile(*F);
	close(F);
	my $digest=$md5->hexdigest;

	return ($ino,$size,$ctime,$mtime,$digest);
}

# takes two array refs and compares them as sets
# returns (status, where)
# status 1 if equal, 0 otherwise
# where is hash of name=>loc, loc 1 for first, 2 for second, 3 for both
sub compareset
{
	my ($baga,$bagb)=@_;
	my %count;

	map { $count{$_}=1; } (@{$baga});
	map { $count{$_}+=2; } (@{$bagb});

	return (((grep $count{$_}!=3, keys %count)?0:1),%count);
}
