Sunday, March 09, 2025

Minimal implementations of grep and sed

 I had been thinking about it from quite some time...

Why do I always do a "shell-out" to `grep` and `sed` to edit Linux configuration files, knowing full well that its not good from security standpoint, and not a clean/efficient code! 🤔

Maybe its time, last week while sitting at a Dental clinic, I quickly wrote a minimal version of `grep`, just supporting the most basic operations, the equivalent of:

grep -e or egrep (extended POSIX regex match)

grep -q (quiet 🤫)

grep -F (fixed string match), and

grep -c (count the number of matches)

All I needed was a quick file read (getline) plus POSIX regex match - a.k.a `regcomp()`, `regexec()` and this was it:


(later I decided to throw in "show line number" and "stop on first match" also 😀 why not!? not a big change)

#define _GNU_SOURCE /* for strcasestr */
#include <stdio.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <stdarg.h>
#include <stdlib.h>
#include <regex.h>
#include <string.h>
#include <errno.h>
#define MGREP_VERBOSE_DEBUG (1<<0)
#define MGREP_FIXED_STRING (1<<1) /* -F / fgrep */
#define MGREP_IGNORE_CASE (1<<2) /* -i */
#define MGREP_EXTENDED_REGEX (1<<3) /* -E / egrep */
#define MGREP_QUIET (1<<4) /* -q */
#define MGREP_FIRST_MATCH (1<<5) /* -m1 */
#define MGREP_COUNT_ONLY (1<<6) /* -c */
#define MGREP_FILE_LINNUM (1<<7) /* -nH */
bool
min_grep (const char *filename,
uint16_t flags,
const char *match_re,
...)
{
FILE *fp = NULL;
char *line = NULL;
size_t len = 0, linnum = 1;
ssize_t nread = 0;
int match_count = 0, rc = 0,re_cflags = 0;
bool opt_plain_str_match = false;
bool opt_silent = false;
bool opt_stop_at_m1 = false;
bool opt_show_fileline = false;
bool opt_ignore_case = false;
regex_t re_id;
if (!filename || !match_re) {
fprintf(stderr, "No filename|match-regex!\n");
return false;
}
if (!(fp = fopen(filename, "r"))) {
fprintf(stderr, "Failed to open file %s (%s)\n", filename, strerror(errno));
return false;
}
opt_plain_str_match = (flags & MGREP_FIXED_STRING) ? true : false;
opt_silent = (flags & MGREP_COUNT_ONLY) ? true : false;
opt_silent = (flags & MGREP_QUIET) ? true : opt_silent;
opt_show_fileline = (flags & MGREP_FILE_LINNUM) ? true : false;
opt_stop_at_m1= (flags & MGREP_FIRST_MATCH) ? true : false;
if (flags & MGREP_IGNORE_CASE) { re_cflags |= REG_ICASE; opt_ignore_case = true; }
if (flags & MGREP_EXTENDED_REGEX) {
if (opt_plain_str_match) {
fprintf(stderr, "Cannot do both regex and plain string match!\n");
return false;
}
re_cflags |= REG_EXTENDED;
}
if (!opt_plain_str_match && ((rc = regcomp(&re_id, match_re, re_cflags)) != 0)) {
char ebuf[1024] = {0};
regerror(rc, &re_id, ebuf, sizeof ebuf);
fprintf(stderr, "Failed to compile regex! (%s)\n", ebuf);
goto err;
}
while ((nread = getline(&line, &len, fp)) != -1) {
if (opt_plain_str_match) {
if ((opt_ignore_case ? strcasestr : strstr )(line, match_re)) {
if (!opt_silent && opt_show_fileline) printf("%s:%zu:", filename, linnum);
if (!opt_silent) printf("%s", line);
match_count++;
if (opt_stop_at_m1) return true;
}
} else {
if (regexec(&re_id, line, 0, NULL, 0) == 0) {
if (!opt_silent && opt_show_fileline) printf("%s:%zu:", filename, linnum);
if (!opt_silent) printf("%s", line);
match_count++;
if (opt_stop_at_m1) return true;
}
}
linnum++;
}
err:
if (!opt_plain_str_match) regfree(&re_id);
free(line);
fclose(fp);
if (flags & MGREP_COUNT_ONLY) fprintf(stderr, "%d", match_count);
return (match_count > 0) ? true : false;
}
int main (int argc, char *argv[])
{
bool ret = min_grep("/etc/hosts", MGREP_EXTENDED_REGEX, "^127[.]");
printf("\nret: %d\n", ret);
return 0;
}
view raw min_grep.c hosted with ❤ by GitHub

Emboldened by this, I attempted a minimal `sed` to do "inline-replace" (my most common use !)

The only additional checks I needed, was to find the match offsets, and of course captures and replace:

#include <stdio.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <stdarg.h>
#include <string.h>
#include <stdlib.h>
#include <regex.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#define MSED_VERBOSE_DEBUG (1<<0)
#define MSED_IGNORE_CASE (1<<1)
#define MSED_EXTENDED_REGEX (1<<2)
#define MSED_REPLACE_INPLACE (1<<3)
#define MSED_FOLLOW_SYMLINKS (1<<4)
bool
min_sed (const char *filename,
uint16_t flags,
const char *match_re,
const char *replace_str,
...)
{
FILE *ifp = NULL, *ofp = NULL;
char *line = NULL;
size_t len = 0;
ssize_t nread;
int match_count = 0, rc = 0;
int re_cflags = 0;
char ofname[PATH_MAX] = {0};
bool opt_replace_inplace = false;
char replacebuf[BUFSIZ] = {0};
regex_t re_id;
regmatch_t pmatch[2];
if (!filename || !match_re) {
fprintf(stderr, "No filename|match-regex!\n");
return false;
}
ifp = fopen(filename, "r");
if (ifp == NULL) {
fprintf(stderr, "Failed to open file %s (%s)\n", filename, strerror(errno));
return false;
}
if (flags & MSED_REPLACE_INPLACE) opt_replace_inplace = true;
if (opt_replace_inplace) {
char linkname[PATH_MAX] = {0};
if (flags & MSED_FOLLOW_SYMLINKS) {
if (readlink(filename, linkname, sizeof linkname) != 0) {
fprintf(stderr, "Failed to read link %s (%s)\n", ofname, strerror(errno));
} else {
// TODO
}
}
sprintf(ofname, "%s~", filename);
ofp = fopen(ofname, "w+");
if (ofp == NULL) {
fprintf(stderr, "Failed to open file %s (%s)\n", ofname, strerror(errno));
return false;
}
}
if (flags & MSED_IGNORE_CASE) { re_cflags |= REG_ICASE; }
if (flags & MSED_EXTENDED_REGEX) { re_cflags |= REG_EXTENDED; }
if ((rc = regcomp(&re_id, match_re, re_cflags)) != 0) {
char ebuf[1024] = {0};
regerror(rc, &re_id, ebuf, sizeof ebuf);
fprintf(stderr, "Failed to compile regex! (%s)\n", ebuf);
}
/* int n; */
while ((nread = getline(&line, &len, ifp)) != -1) {
if (regexec(&re_id, line, ARR_SIZ(pmatch), pmatch, 0) == 0) {
char *rp = strchr(replace_str, '&');
if (rp && (pmatch[1].rm_eo > 0)) {
char new_replace[BUFSIZ] = {0};
snprintf(new_replace, BUFSIZ, "%.*s%.*s%s", (int)(rp - replace_str), replace_str,
(int)(pmatch[1].rm_eo - pmatch[1].rm_so - 1), line + pmatch[1].rm_so ,
rp + 1);
fprintf(opt_replace_inplace ? ofp : stderr, "%s", new_replace);
} else {
if ((pmatch[0].rm_so == 0) && (pmatch[0].rm_eo == nread)) {
/* entire line replace */
fprintf(opt_replace_inplace ? ofp : stderr, "%s\n", replace_str);
} else {
/* part replace */
memset(replacebuf, 0, BUFSIZ);
snprintf(replacebuf, BUFSIZ, "%.*s%s%s", pmatch[0].rm_so, line, replace_str, line + pmatch[0].rm_eo);
fprintf(opt_replace_inplace ? ofp : stderr, "%s", replacebuf);
}
}
match_count++;
} else {
fprintf(opt_replace_inplace ? ofp : stderr, "%s", line);
}
}
regfree(&re_id);
free(line);
fclose(ifp);
if (opt_replace_inplace) {
fflush(ofp);
fclose(ofp);
if (match_count > 0) {
rc = rename(ofname, filename);
} else {
rc = unlink(ofname);
}
return rc == 0 ? true : false;
}
return (match_count > 0) ? true : false;
}
int main (int argc, char *argv[])
{
bool ret = min_sed("/etc/fail2ban/fail2ban.conf",
MSED_EXTENDED_REGEX|MSED_REPLACE_INPLACE,
"^#?ignoreip\\s+=\\s+(.*)", "ignoreip = & 9.9.9.9/18");
printf("\nret: %d\n", ret);
return 0;
}
view raw min_sed.c hosted with ❤ by GitHub


I am saving only 2 set of captures, one for the entire match (implicit, in `pmatch[0]`) and one if the user specifies a capture (`pmatch[1]`)

Note: I know `&` means  "all that matched" and not same as `\1` but since I am not going to add support for multiple captures and replace, this is it!

Thursday, February 27, 2025

 Switch case on strings!


Interesting solution:    https://stackoverflow.com/a/51816519/112687 


#ifndef __SSWITCH_H__

#define __SSWITCH_H__

#include <string.h>

    #ifndef SWITCH_CASE_INIT

    #define SWITCH_CASE_INIT

    #define SWITCH(X)  for(char* __switch_p__=X,__switch_next__=1;__switch_p__;__switch_p__=0,__switch_next__=1) {{

    #define CASE(X)    } if (!__switch_next__ || !(__switch_next__ = strcmp(__switch_p__, X))) {

    #define DEFAULT    } {

    #define END        }}

    #endif

#endif


Wednesday, December 04, 2024

Block comments in bash

In C/C++, to comment out a block of code, I generally use

 #if 0
 ...stuff
 #endif 

 Or block comment (if there are no other comments)

 /* 
 ...stuff
 */ 

 But there is no such thing in bash, so the common hack is

 : '
 ...stuff
 '
 
i.e., use the : (no-op) operator and then the entire block to be commented goes into a single-quoted string! 

But even more simpler is:

 if false; then
 ... stuff
 fi

 why not!? :=)

Tuesday, February 12, 2019

REPL for C!?

Was instructing a colleague on how to use REPL first, when working on languages like Python/Lua, to try the code snippet interactively. And this thought passed through - "why not a simple REPL for C"? And, that's how I got to hack together a quick shell (bash) script which does just that - a REPL for C!
Of course, C is not interpreted or dynamic, and you cannot do many things that you could do in the REPL of a dynamic language. But, this does save some keystrokes when you want to quickly run and check some code snippet.

The script:
https://gist.github.com/aniruddha-a/2877defd751b20a8f4dbbe04b7b6fe43

The readline helper, with history and completion for often used std functions/types:
https://gist.github.com/aniruddha-a/dfca2c5f88fdc1df5591943422fe0349

Wednesday, September 20, 2017

How does Checkinstall work


Today, while trying to build and install some software from source, I discovered
checkinstall
What it does: tries to get what make install does, and then creates a Debian (or rpm etc)
package. At first it might sound like Ah! not much, but think again ...
Makefile can have any kind of recipes, invocation of other shell commands, loops ...
how the #&$% can it know!?

I could not rest until I understood how this cool thing works! As usual, I tried to first Google if I can find some articles, but didn't find any. OK! no problem, I downloaded the sources and grok'd through it.

Here is a brief explanation:

- checkinstall relies on a utility called installwatch
- installwatch traps quite many libc calls like open() chown() unlink() etc and logs the operations in a parseable form
- This info is used by a shell wrapper to translate it to commands/formats for different package managers

Thursday, August 24, 2017

Enable tracing on bash scripts

With shell scripts, it becomes difficult at times to debug which command took too long, or at which step has an issue. Even though I add generous amount of debug logs there are times where -x (execution trace) is invaluable. But I don't want to litter my regular logs with the -x output either!
So, after a bit of search found this nice snippet of bash:

trace_file() {
    exec 4>>"$1"
    BASH_XTRACEFD=4
    set -x
}

With this function defined, I can invoke it in any script, with the path to trace file:

trace_file /tmp/mytrace

Monday, October 10, 2016

How (not) to write a parser

One of my favorite subjects during college days was Parsing and compiler construction (remember The dragon book?)Ever since the first time I tried lex/yacc (flex/bison) it was a mysterious piece of software that I wanted to master using! 

Zoom forward 10 years, a lot has changed in the field: LR(1) is passé, GLR and LL(1) are in-vogue (thanks to tools like ANTLR) 

When I thought about venturing into writing a parser for the YANG modelling language (RFC 6020), I first did some study+survey, and I formed some new opinions about parsing: 
- Don't roll out a RDP in desperation  
- No separate tokenization phase (scanner-less) 
- Not LR(1) - too limited 
- Not ANTLR! (though its a fantastic tool - it leans a lot towards Java!) 
- Not Bison GLR! (no proper documentation!) 
- Use PEG (yet another learning opportunity :-)) 
- Not C! (more about it later) 

PEG is cool, and with a love for regex PEG attracted me way too much :-) I tried many PEG based parser generators. 

To start with I picked up Waxeye , though it has a choice of many targets, the C port is not optimal (author himself told, that more work's needed). 

Then I tried peg/leg, though its very similar to lex/yacc in usage, I was finding it hard to debug/modify the grammar. Part of the problem could have also been the YANG ABNF grammar (as defined in the RFC). There are no reserved words in YANG. I tried peg/leg much more than Waxeye, but eventually I gave up once I realized that I was getting nowhere! So, C is gone - as I had only 2 options that had a C target!

Lua/LPeg: Though I'm familiar with Lua, this was my first tryst with LPeg. I had heard a lot of praise for Roberto's work, and I got to know why, quite soon. Within a few hours I was able to create a matcher (deliberately avoiding the word parser here) that could recognize 80% of the YANG files at my disposal! (had to tweak the matcher for the remaining 20%). This time, I chose a different approach, instead of using the YANG ABNF grammar, I created some simple relaxed rules. 

Here is a brief description of the approach taken.

YANG statements can be either of the form:

1.   identifier [value];

Or:

2.   identifier [value] {
            One or more of 1. (depending on identifier)
  }

Note that value is optional. Since there are no reserved words, it leads to a simple matcher (~30lines!). Even though there are no reserved words, we have some keywords, and based on that we should be able to say whether the nesting is right or wrong – I call this as the validation phase. And, that too, can be simplified to triviality, if we keep the AST as a simple Lua array (table).

For e.g: A leaf can be under a container, but not vice-versa:

Valid:
container C1{
  leaf l1 {
     type string;
  }
}

Invalid:
 leaf l1 {
  container C1 {
  }
}

I keep a list of allowed-children for each type of node. Additional checks are performed with custom functions per node-type. And, since these children are arranged as arrays – we can also ensure order.

That’s it! :-) - take a look at: https://github.com/aniruddha-a/lyang

Comments/feedback welcome.