Search code examples
Tags: unix, rust, compilation, pipe, broken-pipe

Simple word count Rust program outputs valid stdout but panics when piped to the head program with specific content


I get this panic and backtrace in Rust:


thread 'main' panicked at 'failed printing to stdout: Broken pipe (os error 32)', library/std/src/io/stdio.rs:993:9
stack backtrace:
   0:     0x559ffa959dc0 - std::backtrace_rs::backtrace::libunwind::trace::h72c2fb8038f1bbee
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/../../backtrace/src/backtrace/libunwind.rs:96
   1:     0x559ffa959dc0 - std::backtrace_rs::backtrace::trace_unsynchronized::h1e3b084883f1e78c
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/../../backtrace/src/backtrace/mod.rs:66
   2:     0x559ffa959dc0 - std::sys_common::backtrace::_print_fmt::h3bf6a7ebf7f0394a
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:79
   3:     0x559ffa959dc0 - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h2e8cb764b7fe02e7
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:58
   4:     0x559ffa972f6c - core::fmt::write::h7a1184eaee6a8644
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/core/src/fmt/mod.rs:1080
   5:     0x559ffa957b12 - std::io::Write::write_fmt::haeeb374d93a67eac
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/io/mod.rs:1516
   6:     0x559ffa95beed - std::sys_common::backtrace::_print::h1d14a7f6ad632dc8
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:61
   7:     0x559ffa95beed - std::sys_common::backtrace::print::h301abac8bb2e3e81
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:48
   8:     0x559ffa95beed - std::panicking::default_hook::{{closure}}::hde0cb80358a6920a
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:208
   9:     0x559ffa95bb98 - std::panicking::default_hook::h9b1a691049a0ec8f
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:227
  10:     0x559ffa95c5d1 - std::panicking::rust_panic_with_hook::h2bdec87b60580584
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:577
  11:     0x559ffa95c179 - std::panicking::begin_panic_handler::{{closure}}::h101ca09d9df5db47
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:484
  12:     0x559ffa95a22c - std::sys_common::backtrace::__rust_end_short_backtrace::h3bb85654c20113ca
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:153
  13:     0x559ffa95c139 - rust_begin_unwind
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:483
  14:     0x559ffa95c0eb - std::panicking::begin_panic_fmt::hf0503558fbe5b251
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:437
  15:     0x559ffa957022 - std::io::stdio::print_to::h9435376f36962f3f
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/io/stdio.rs:993
  16:     0x559ffa957022 - std::io::stdio::_print::h0d31d4b9faa6e1ec
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/io/stdio.rs:1005
  17:     0x559ffa944807 - wordstats::main::h1c2ea6400047a5eb
  18:     0x559ffa942e73 - std::sys_common::backtrace::__rust_begin_short_backtrace::h9e31cf87ddc88116
  19:     0x559ffa942e49 - std::rt::lang_start::{{closure}}::h6c6491f05894818f
  20:     0x559ffa95c9f7 - core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once::he179d32a5d10d957
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/core/src/ops/function.rs:259
  21:     0x559ffa95c9f7 - std::panicking::try::do_call::hcb3d5e7be089b2b4
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:381
  22:     0x559ffa95c9f7 - std::panicking::try::h7ac93b0cd56fb701
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:345
  23:     0x559ffa95c9f7 - std::panic::catch_unwind::h7b40e396c93a4fcd
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panic.rs:382
  24:     0x559ffa95c9f7 - std::rt::lang_start_internal::h142b9cc66267fea1
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/rt.rs:51
  25:     0x559ffa944ae2 - main
  26:     0x7f6223a380b3 - __libc_start_main
  27:     0x559ffa94209e - _start
  28:                0x0 - <unknown>

when I compile this program

use diacritics;
use std::collections::HashMap;
use std::io;
use std::io::prelude::*;

/// A word paired with the number of times it was counted.
#[derive(Debug)]
struct Entry {
    /// The word itself.
    word: String,
    /// How many times `word` was counted.
    count: u32,
}

/// Characters that terminate a word when a line is cut into words.
///
/// The `'static` lifetime is implied for references stored in a `static`, so it is
/// not spelled out.
static SEPARATORS: &[char] = &[
    ' ', ',', '.', '!', '?', '\'', '"', '\n', '(', ')', '#', '{', '}', '[', ']', '-', ';', ':',
];

/// Reads text from stdin, counts every word and prints the ranking.
///
/// # Panics
///
/// Panics if a line cannot be read from stdin.
fn main() {
    let mut counts = HashMap::<String, u32>::new();
    let input = io::stdin();
    input
        .lock()
        .lines()
        .map(|read| read.unwrap())
        .for_each(|text| line_processor(text, &mut counts));
    output(&mut counts);
}

/// Cuts one input line into words and records each of them in `words`.
///
/// The line is stripped of diacritics and lowercased first, then split at every
/// character listed in `SEPARATORS`. Empty fragments, produced by consecutive
/// separators, are skipped.
///
/// Splitting also yields the last word of the line even when no separator follows
/// it, which matters because `main` feeds lines without their trailing newline.
fn line_processor(line: String, words: &mut HashMap<String, u32>) {
    let formatted_line = diacritics::remove_diacritics(&line).to_lowercase();

    for word in formatted_line
        .split(SEPARATORS)
        .filter(|fragment| !fragment.is_empty())
    {
        add_word(word.to_string(), words);
    }
}

/// Increments the counter stored for `word` in `words`; a new word starts at 1.
///
/// Empty strings are ignored, so callers may pass the (possibly empty) text found
/// between two separators without checking it first.
fn add_word(word: String, words: &mut HashMap<String, u32>) {
    if word.is_empty() {
        return;
    }
    // The entry API hashes the key once and reuses the owned `word` as the key.
    *words.entry(word).or_insert(0) += 1;
}

/// Prints every counted word to stdout as `count<TAB>word`, highest count first.
///
/// Words with equal counts keep the order in which the map yields them.
fn output(words: &mut HashMap<String, u32>) {
    let mut ranking: Vec<Entry> = words
        .iter()
        .map(|(text, &hits)| Entry {
            word: text.clone(),
            count: hits,
        })
        .collect();

    // Stable descending sort: ties stay in map iteration order.
    ranking.sort_by(|lhs, rhs| rhs.count.cmp(&lhs.count));

    for entry in ranking {
        println!("{}\t{}", entry.count, entry.word);
    }
}

this way :

cargo build --release

and I run the program like this :

cat src/sample.txt | ./target/release/wordstats  | head -n 50

This program should just show something like this (top word count) with no trace :

15  the
14  in
11  are
10  and
10  of
9   species
9   bats
8   horseshoe
8   is
6   or
6   as
5   which
5   their

This works with some echoed content, or with some other files (e.g. cat src/main.rs | ...), but not with this file's content, which is part of a random Wikipedia page.

My program is a simple word count that just prints a sorted, tab-separated list of counts and words.

The issue occurs when I pipe the result in the head -n 50 program but not when I print the full output

Any idea why I get such a trace? Am I handling something the wrong way in my program, or could it be related to something else (Rust library / Unix misbehavior)?

My rustc version is : rustc 1.48.0 (7eac88abb 2020-11-16)

Edit :

Add missing Cargo.toml

[package]
name = "wordstats"
version = "0.1.0"
authors = ["Eric Régnier <utopman@gmail.com>"]
edition = "2018"

[dependencies]
diacritics = "0.1.1"

Solution

  • Firstly, you didn't provide enough information to reproduce your problem. You provided source code that uses a third party dependency but neglected to provide a Cargo.toml. In your case, it was very easy to remove the use of the dependency without impacting the problem at hand, so that's what I did.

    Secondly, using println! in non-toy command line programs is a footgun for precisely this reason. Namely, there are two issues that combine together to produce this undesirable behavior:

    1. println! will panic if any error occurs while writing to stdout.
    2. One of the few things Rust's runtime does is ignore SIGPIPE, which means instead of your application getting sent a PIPE signal, the corresponding write to the file descriptor that was closed returns an error instead. (In that link, you can see that I am on record as advocating for a change in this behavior.)

    In a typical C program, SIGPIPE is not ignored. It is also typically not explicitly handled either. When a process is sent a signal that it doesn't handle, the process terminates. And that's exactly what you want in cases like this. Once head stops reading its stdin (your stdout), you want your program to stop, but you also want it to stop gracefully without panicking or printing an error. Because that's what Unix CLI utilities do.

    You have two ways to solve this problem. One way is to change your code to handle BrokenPipe errors explicitly. Your code is written in a way that acts as if errors can't happen, since you unwrap the result of reading stdin. So your program is not idiomatic and not set up to handle errors. So in order to deal with BrokenPipe correctly, I had to make a couple of small changes so that it bubbles up errors correctly:

    use std::collections::HashMap;
    use std::io;
    use std::io::prelude::*;
    
    /// A word paired with the number of times it was counted.
    #[derive(Debug)]
    struct Entry {
        /// The word itself.
        word: String,
        /// How many times `word` was counted.
        count: u32,
    }
    
    /// Characters that terminate a word when a line is cut into words.
    ///
    /// The `'static` lifetime is implied for references stored in a `static`, so it
    /// is not spelled out.
    static SEPARATORS: &[char] = &[
        ' ', ',', '.', '!', '?', '\'', '"', '\n', '(', ')', '#', '{', '}', '[', ']', '-', ';', ':',
    ];
    
    /// Runs `try_main` and maps its outcome onto process-level behavior.
    ///
    /// * `BrokenPipe` means the reader of our stdout went away (e.g. `head` has
    ///   seen enough); this is treated as a normal, silent end.
    /// * Any other error is reported on stderr and the process exits with status 1,
    ///   so callers and shell pipelines can detect the failure.
    fn main() {
        if let Err(err) = try_main() {
            if err.kind() == std::io::ErrorKind::BrokenPipe {
                return;
            }
            // Ignore any error that may occur while writing to stderr.
            let _ = writeln!(std::io::stderr(), "{}", err);
            std::process::exit(1);
        }
    }
    
    /// Counts the words read from stdin and writes the ranking to stdout.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error hit while reading a line or while writing the
    /// result; nothing is written once reading has failed.
    fn try_main() -> Result<(), std::io::Error> {
        let mut counts = HashMap::<String, u32>::new();
        let input = io::stdin();
        input
            .lock()
            .lines()
            .try_for_each(|read| read.map(|text| line_processor(text, &mut counts)))?;
        output(&mut counts)
    }
    
    /// Cuts one input line into words and records each of them in `words`.
    ///
    /// The line is split at every character listed in `SEPARATORS`; empty
    /// fragments, produced by consecutive separators, are skipped.
    ///
    /// Splitting also yields the last word of the line even when no separator
    /// follows it, which matters because `try_main` feeds lines without their
    /// trailing newline.
    fn line_processor(line: String, words: &mut HashMap<String, u32>) {
        for word in line
            .split(SEPARATORS)
            .filter(|fragment| !fragment.is_empty())
        {
            add_word(word.to_string(), words);
        }
    }
    
    /// Increments the counter stored for `word` in `words`; a new word starts at 1.
    ///
    /// Empty strings are ignored, so callers may pass the (possibly empty) text
    /// found between two separators without checking it first.
    fn add_word(word: String, words: &mut HashMap<String, u32>) {
        if word.is_empty() {
            return;
        }
        // The entry API hashes the key once and reuses the owned `word` as the key.
        *words.entry(word).or_insert(0) += 1;
    }
    
    /// Writes every counted word to stdout as `count<TAB>word`, highest count first.
    ///
    /// Words with equal counts keep the order in which the map yields them.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by a write to stdout; remaining entries
    /// are not written.
    fn output(words: &mut HashMap<String, u32>) -> Result<(), std::io::Error> {
        let mut ranking: Vec<Entry> = words
            .iter()
            .map(|(text, &hits)| Entry {
                word: text.clone(),
                count: hits,
            })
            .collect();

        // Stable descending sort: ties stay in map iteration order.
        ranking.sort_by(|lhs, rhs| rhs.count.cmp(&lhs.count));

        let handle = io::stdout();
        let mut sink = handle.lock();
        for entry in ranking {
            writeln!(sink, "{}\t{}", entry.count, entry.word)?;
        }
        Ok(())
    }
    

    The second way of handling this is to go back to the default behavior of SIGPIPE. This will cause your Rust application to behave like a C application. That can be accomplished by defining a function to reset the signal handler for SIGPIPE to SIG_DFL (this uses the libc crate, so it must be added to the [dependencies] section of Cargo.toml, e.g. libc = "0.2"):

    /// Sets the disposition of `SIGPIPE` back to the default (`SIG_DFL`) on Unix
    /// targets.
    #[cfg(unix)]
    fn reset_sigpipe() {
        // SAFETY: NOTE(review) — `libc::signal` is a plain FFI call and `SIG_DFL`
        // registers no Rust callback; presumably this should run before any other
        // thread exists. Confirm against the libc/POSIX documentation.
        unsafe {
            libc::signal(libc::SIGPIPE, libc::SIG_DFL);
        }
    }
    
    /// Counterpart of `reset_sigpipe` for non-Unix targets: does nothing.
    #[cfg(not(unix))]
    fn reset_sigpipe() {
        // no-op
    }
    

    And then call it before doing any I/O (in the full code below it is the first statement of try_main, which main calls right away). Then you can remove any specific handling of the BrokenPipe error because it won't occur. Instead, your process will be sent a PIPE signal and it will subsequently terminate. Here's the full code:

    use std::collections::HashMap;
    use std::io;
    use std::io::prelude::*;
    
    /// A word paired with the number of times it was counted.
    #[derive(Debug)]
    struct Entry {
        /// The word itself.
        word: String,
        /// How many times `word` was counted.
        count: u32,
    }
    
    /// Characters that terminate a word when a line is cut into words.
    ///
    /// The `'static` lifetime is implied for references stored in a `static`, so it
    /// is not spelled out.
    static SEPARATORS: &[char] = &[
        ' ', ',', '.', '!', '?', '\'', '"', '\n', '(', ')', '#', '{', '}', '[', ']', '-', ';', ':',
    ];
    
    /// Runs `try_main`; on failure reports the error on stderr and exits with
    /// status 1 so callers and shell pipelines can detect that the run failed.
    fn main() {
        if let Err(err) = try_main() {
            // Ignore any error that may occur while writing to stderr.
            let _ = writeln!(std::io::stderr(), "{}", err);
            std::process::exit(1);
        }
    }
    
    /// Resets `SIGPIPE` handling, then counts the words read from stdin and writes
    /// the ranking to stdout.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error hit while reading a line or while writing the
    /// result; nothing is written once reading has failed.
    fn try_main() -> Result<(), std::io::Error> {
        // Must happen before any write to stdout can hit a closed pipe.
        reset_sigpipe();

        let mut counts = HashMap::<String, u32>::new();
        let input = io::stdin();
        input
            .lock()
            .lines()
            .try_for_each(|read| read.map(|text| line_processor(text, &mut counts)))?;
        output(&mut counts)
    }
    
    /// Cuts one input line into words and records each of them in `words`.
    ///
    /// The line is split at every character listed in `SEPARATORS`; empty
    /// fragments, produced by consecutive separators, are skipped.
    ///
    /// Splitting also yields the last word of the line even when no separator
    /// follows it, which matters because `try_main` feeds lines without their
    /// trailing newline.
    fn line_processor(line: String, words: &mut HashMap<String, u32>) {
        for word in line
            .split(SEPARATORS)
            .filter(|fragment| !fragment.is_empty())
        {
            add_word(word.to_string(), words);
        }
    }
    
    /// Increments the counter stored for `word` in `words`; a new word starts at 1.
    ///
    /// Empty strings are ignored, so callers may pass the (possibly empty) text
    /// found between two separators without checking it first.
    fn add_word(word: String, words: &mut HashMap<String, u32>) {
        if word.is_empty() {
            return;
        }
        // The entry API hashes the key once and reuses the owned `word` as the key.
        *words.entry(word).or_insert(0) += 1;
    }
    
    /// Writes every counted word to stdout as `count<TAB>word`, highest count first.
    ///
    /// Words with equal counts keep the order in which the map yields them.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by a write to stdout; remaining entries
    /// are not written.
    fn output(words: &mut HashMap<String, u32>) -> Result<(), std::io::Error> {
        let mut ranking: Vec<Entry> = words
            .iter()
            .map(|(text, &hits)| Entry {
                word: text.clone(),
                count: hits,
            })
            .collect();

        // Stable descending sort: ties stay in map iteration order.
        ranking.sort_by(|lhs, rhs| rhs.count.cmp(&lhs.count));

        let handle = io::stdout();
        let mut sink = handle.lock();
        for entry in ranking {
            writeln!(sink, "{}\t{}", entry.count, entry.word)?;
        }
        Ok(())
    }
    
    
    /// Sets the disposition of `SIGPIPE` back to the default (`SIG_DFL`) on Unix
    /// targets.
    #[cfg(unix)]
    fn reset_sigpipe() {
        // SAFETY: NOTE(review) — `libc::signal` is a plain FFI call and `SIG_DFL`
        // registers no Rust callback; presumably this should run before any other
        // thread exists. Confirm against the libc/POSIX documentation.
        unsafe {
            libc::signal(libc::SIGPIPE, libc::SIG_DFL);
        }
    }
    
    /// Counterpart of `reset_sigpipe` for non-Unix targets: does nothing.
    #[cfg(not(unix))]
    fn reset_sigpipe() {
        // no-op
    }