^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) #!/usr/bin/env perl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) # SPDX-License-Identifier: GPL-2.0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) # Clean a text file -- or directory of text files -- of stealth whitespace.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) # WARNING: this can be a highly destructive operation. Use with caution.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) use warnings;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) use bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) use File::Basename;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11)
# Built-in defaults; command-line options may override them further down.
# A value of 0 switches the line-width check off.
our $max_width = 79;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14)
# Normalize every run of spaces that is immediately followed by a tab.
#
# Spaces in front of a tab are redundant: they are either dropped or,
# when they cross one or more 8-column tab stops, replaced by the tabs
# needed to reach the same column.  Spaces that are followed by anything
# other than a tab (including the end of the string) are kept verbatim.
#
# Takes the text to clean (one or more lines) and returns the cleaned
# copy; the argument itself is not modified.
sub clean_space_tabs($)
{
    no bytes;                   # Tab alignment depends on characters

    my ($text)  = @_;
    my $out     = '';
    my $column  = 0;            # Column reached by what is already in $out
    my $pending = 0;            # Spaces seen but not yet emitted

    foreach my $ch (split(//, $text)) {
        if ($ch eq ' ') {
            # Hold the space back until we know what follows it.
            $pending++;
            next;
        }

        if ($ch eq "\t") {
            # Column the original text reaches after this tab ...
            my $target = ($column + $pending + 8) & ~7;
            # ... expressed as the number of tab stops crossed on the way.
            $out .= "\t" x (($target >> 3) - ($column >> 3));
            $column  = $target;
            $pending = 0;
            next;
        }

        # Any other character: the held-back spaces are real, emit them.
        $out .= (' ' x $pending) . $ch;
        if ($ch eq "\n" || $ch eq "\r") {
            $column = 0;        # Line break: back to the first column
        } else {
            $column += $pending + 1;
        }
        $pending = 0;
    }

    # Spaces at the very end of the input are preserved as they are.
    return $out . (' ' x $pending);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54)
# Compute the visual width of a string.
#
# Tabs advance to the next multiple of 8 columns and every other
# character counts as one column.  Both "\n" and "\r" end the current
# line and reset the column to 0, which matches how clean_space_tabs()
# tracks columns.  If the string holds several lines, the width of the
# widest one is returned.
#
# Takes the text to measure and returns the widest column reached
# (0 for an empty string).
sub strwidth($) {
    no bytes;                   # Tab alignment depends on characters

    my ($li) = @_;
    my $pos  = 0;               # Current column on the current line
    my $mlen = 0;               # Widest line seen so far

    foreach my $c (split(//, $li)) {
        if ($c eq "\t") {
            $pos = ($pos + 8) & ~7;
        } elsif ($c eq "\n" || $c eq "\r") {
            # A carriage return moves back to column 0 just like a
            # newline does; it must not be counted as a printed column.
            $mlen = $pos if ($pos > $mlen);
            $pos = 0;
        } else {
            $pos++;
        }
    }

    # Account for a final line that has no terminator.
    $mlen = $pos if ($pos > $mlen);
    return $mlen;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79)
# Split a command line into the width limit and the list of files.
#
# Arguments: the width to use when no option overrides it, followed by
# the raw command-line words.  Recognized options are "-width N" and
# "-w N", where N must be a non-negative decimal integer (0 disables
# the width check).  Every word not starting with "-" is a file name.
#
# Returns (width, reference to the array of file names) on success and
# an empty list when the command line is malformed: an unknown option,
# a width option without a value, or a value that is not a number.
sub _parse_args {
    my ($width, @args) = @_;
    my @paths;

    while (defined(my $arg = shift(@args))) {
        if ($arg !~ /^-/) {
            push(@paths, $arg);
            next;
        }

        return unless $arg eq '-width' || $arg eq '-w';

        # Insist on a real number: a missing or non-numeric value used
        # to be coerced to 0, silently switching the width check off.
        my $value = shift(@args);
        return unless defined($value) && $value =~ /\A\d+\z/;
        $width = $value + 0;
    }

    return ($width, \@paths);
}

our $name = basename($0);

my ($parsed_width, $parsed_files) = _parse_args($max_width, @ARGV);
if (!defined($parsed_files)) {
    print STDERR "Usage: $name [-width #] files...\n";
    exit 1;
}

$max_width = $parsed_width;
our @files = @{$parsed_files};
@ARGV = ();                     # All arguments have been consumed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96)
# Clean every file in @files in place.
#
# For each file: skip it unless it is a regular file that can be opened
# read/write and contains no NUL byte; strip trailing spaces, tabs and
# carriage returns from every line; normalize space-before-tab runs via
# clean_space_tabs(); drop blank lines at the end of the file; and warn
# on STDERR about lines wider than $max_width (when that is non-zero).
# The file is rewritten only when its content actually changed.
foreach my $f (@files) {
    print STDERR "$name: $f\n";

    if (! -f $f) {
        print STDERR "$f: not a file\n";
        next;
    }

    my $fh;
    if (!open($fh, '+<', $f)) {
        print STDERR "$name: Cannot open file: $f: $!\n";
        next;
    }

    binmode $fh;

    # First, verify that it is not a binary file; consider any file
    # with a zero byte to be a binary file.  Is there any better, or
    # additional, heuristic that should be applied?
    my $is_binary = 0;
    my $data;

    # read() yields 0 at end of file and undef on error; both end the scan.
    while (read($fh, $data, 65536)) {
        if ($data =~ /\0/) {
            $is_binary = 1;
            last;
        }
    }

    if ($is_binary) {
        print STDERR "$name: $f: binary file\n";
        close($fh);             # Do not leave the handle open when skipping
        next;
    }

    seek($fh, 0, 0);

    my @blanks  = ();           # Blank lines not yet known to be interior
    my @lines   = ();           # Cleaned content to write back
    my $lineno  = 0;
    my $changed = 0;            # True once the cleaned text differs

    while (defined(my $line = <$fh>)) {
        $lineno++;
        my $original = $line;

        $line =~ s/[ \t\r]*$//; # Remove trailing spaces
        $line = clean_space_tabs($line);

        # Compare content, not length: turning "space, tab" into
        # "tab, tab" changes the line without changing its size.
        $changed = 1 if $line ne $original;

        if ($line eq "\n") {
            # Hold blank lines back; they are only kept if a non-blank
            # line follows them.
            push(@blanks, $line);
        } else {
            push(@lines, @blanks, $line);
            @blanks = ();
        }

        my $l_width = strwidth($line);
        if ($max_width && $l_width > $max_width) {
            print STDERR
                "$f:$lineno: line exceeds $max_width characters ($l_width)\n";
        }
    }

    # Any blanks at the end of the file are discarded
    $changed = 1 if @blanks;

    if ($changed) {
        # Only write to the file if changed
        seek($fh, 0, 0);
        print {$fh} @lines
            or die "$name: Failed to write modified file: $f: $!\n";

        # tell() signals failure with -1, not undef.
        my $where = tell($fh);
        if (!defined($where) || $where < 0 || !truncate($fh, $where)) {
            die "$name: Failed to truncate modified file: $f: $!\n";
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close($fh)
        or die "$name: Failed to close file: $f: $!\n";
}