Clicky

Fortran Wiki
regex_module

This interface illustrates how wrapper functions can simplify the user API, and handle C header declarations that are not readily accessible in Fortran.

Below are sources for the main Fortran module, the C support/wrapper code, and a small example program.

This is only a preliminary version, It would be nice to automatically handle the match array, but the maximum number of matches must still be declared by the caller.

PCRE is a much more powerful pattern library, and may be a better choice than POSIX regex. You may find some wrappers in the projects here: fortran_pcre fregex ftlRegex

module regex_module
!-------------------------------------------------------------------
! Fortran interface to POSIX regex, using ISO_C_BINDING.
!
! Regex is defined as an API using C headers. It does not define the
! exact value of flag tokens, just the names. It also uses an opaque
! data structure and a declared numeric type for the match array.
! Therefore, the code must either be generated for each target
! platform, or it use wrapper functions written in C.
!
! Fortran wrapper functions are also required to present a normal
! Fortran API, ant not require C conversions by the caller.
!
! The interface here is not strictly correct, because it does not
! explicitly convert Fortran strings to the C character kind. 
! Fortran only supports conversion of string kinds by assignment,
! or by a rather slow internal WRITE. For now, the easiest approach
! is to assume that C and Fortran default  character kinds are the
! same. This is generally true, but UTF-8 strings are likely to
! cause problems.
!-------------------------------------------------------------------
! API:
!
! Compile a regex into a regex object
!  subroutine regcomp(this,pattern,flags,status)
!    type(regex_type), intent(out) :: this           ! new regex object
!    character(len=*), intent(in) :: pattern         ! regex pattern string
!    character(len=*), intent(in), &
!                         optional :: flags ! flag characters:
!                                           ! x = extended regex (REG_EXTENDED)
!                                           ! m = multi-line     (REG_NEWLINE)
!                                           ! i = case-insensitive (REG_ICASE)
!                                           ! n = no MATCH required (REG_NOSUB)
!    integer, intent(out), optional :: status ! If absent, errors are fatal
!  end subroutine regcomp
!
! Execute a compiled regex against a string
!  function regexec(this,string,matches,flags,status) result(match)
!    logical :: match ! .TRUE. if the pattern matched
!    type(regex_type), intent(in) :: this ! regex object
!    character(len=*), intent(in) :: string ! target string
!    character(len=*), intent(in), &
!                     optional :: flags ! flag characters (for partial lines):
!                                       ! b = no beginning-of-line (REG_NOTBOL)
!                                       ! e = no end-of-line (REG_NOTEOL)
!    integer, intent(out), optional :: matches(:,:) ! match locations,
!                                                   ! dimension(2,nmatches)
!    integer, intent(out), optional :: status ! If absent, errors are fatal
!  end function
!
! Get the string message for a status error value
!  subroutine regerror(this,errcode,errmsg,errmsg_len)
!    type(regex_type), intent(in) :: this
!    integer, intent(in) :: errcode
!    character, intent(out) :: errmsg
!    integer, intent(out) :: errmsg_len
!    errmsg_len = C_regerror(int(errcode,C_int), this%preg, &
!                 errmsg, int(len(errmsg),C_size_t))
!  end subroutine regerror
!
! Release 
!  subroutine regfree(this)
!    type(regex_type), intent(inout) :: this
!  end subroutine regfree
!-------------------------------------------------------------------
! TODO:
! * More documentation.
! * Implement allocatable-length strings when commonly available.
! * Maybe store the matches array inside the regex_type structure?
!-------------------------------------------------------------------
  use ISO_C_Binding, only: C_ptr, C_int, C_size_t, C_char, &
                           C_NULL_char, C_NULL_ptr
  use ISO_Fortran_Env, only: ERROR_UNIT
! Fortran regex structure holds a pointer to an opaque C structure
  type regex_type
    type(C_ptr) :: preg
  end type regex_type
  interface
    subroutine C_regalloc(preg_return) &
        bind(C,name="C_regalloc")
      import
      type(C_ptr), intent(out) :: preg_return
    end subroutine C_regalloc
    subroutine C_regcomp(preg,pattern,flags,status) &
        bind(C,name="C_regcomp")
      import
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(in) :: pattern(*)
      character(len=1,kind=C_char), intent(in) :: flags(*)
      integer(C_int), intent(inout) :: status
    end subroutine C_regcomp
    subroutine C_regexec(preg,string,nmatch,matches,flags,status) &
        bind(C,name="C_regexec")
      import
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(in) :: string(*)
      integer(C_int), intent(in), value :: nmatch
      integer(C_int), intent(out) :: matches(2,nmatch)
      character(len=1,kind=C_char), intent(in) :: flags(*)
      integer(C_int), intent(out) :: status
    end subroutine C_regexec
    function C_regerror(errcode, preg, errbuf, errbuf_size) &
        result(regerror) bind(C,name="regerror")
      import
      integer(C_size_t) :: regerror
      integer(C_int), value :: errcode
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(out) :: errbuf
      integer(C_size_t), value :: errbuf_size
    end function C_regerror
    subroutine C_regfree(preg) bind(C,name="regfree")
      import
      type(C_ptr), intent(in), value :: preg
    end subroutine C_regfree
  end interface
contains
  subroutine regcomp(this,pattern,flags,status)
    type(regex_type), intent(out) :: this
    character(len=*), intent(in) :: pattern
    character(len=*), intent(in), optional :: flags
    integer, intent(out), optional :: status
! local
    integer(C_int) :: status_
    character(len=10,kind=C_char) :: flags_
! begin
    flags_=' '
    if (present(flags)) flags_=flags
    this%preg = C_NULL_ptr
    call C_regalloc(this%preg)
    call C_regcomp(this%preg, trim(pattern)//C_NULL_char, &
                   trim(flags)//C_NULL_char, status_)
    if (present(status)) then
      status=status_
    else if (status_/=0) then

      stop 'Regex runtime error: regcomp failed.'
    end if
  end subroutine regcomp
  logical function regexec(this,string,matches,flags,status) &
        result(match)
    type(regex_type), intent(in) :: this
    character(len=*), intent(in) :: string
    character(len=*), intent(in), optional :: flags
    integer, intent(out), optional :: matches(:,:)
    integer, intent(out), optional :: status
! local
    integer(C_int) :: status_, matches_(2,1)
    character(len=10,kind=C_char) :: flags_
! begin
    flags_=' '
    if (present(flags)) flags_=flags
    write(*,*) 'calling C, nmatches=',nmatches
    if (present(matches)) then
      call C_regexec(this%preg, trim(string)//C_NULL_char, &
                   size(matches,2),matches, &
                   trim(flags_)//C_NULL_char, status_)
    else
      call C_regexec(this%preg, trim(string)//C_NULL_char, &
                   int(0,C_int),matches_, &
                   trim(flags_)//C_NULL_char, status_)
    end if
    match = status_==0
    if (present(status)) then
      status=status_
    else if (status_/=0.and.status_/=1) then
      stop 'Regex runtime error: regexec failed.'
    end if
  end function regexec
  function regmatch(match,string,matches)
    integer, intent(in) :: match, matches(2,*)
    character(len=*), intent(in) :: string
    character(len=matches(2,match)-matches(1,match)) :: regmatch
    regmatch = string(matches(1,match)+1:matches(2,match))
  end function regmatch
  subroutine regerror(this,errcode,errmsg,errmsg_len)
    type(regex_type), intent(in) :: this
    integer, intent(in) :: errcode
    character, intent(out) :: errmsg
    integer, intent(out) :: errmsg_len
    errmsg_len = C_regerror(int(errcode,C_int), this%preg, &
                 errmsg, int(len(errmsg),C_size_t))
  end subroutine regerror
  subroutine regfree(this)
    type(regex_type), intent(inout) :: this
    call C_regfree(this%preg)
    this%preg = C_NULL_ptr
  end subroutine regfree
end module regex_module
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdlib.h>

void C_regalloc(regex_t **preg_return) {
  *preg_return = malloc(sizeof(**preg_return));
}

/* pattern must be NUL terminated. */
void C_regcomp(regex_t *preg, const char *pattern,
               const char *flags, int *status_return) {
  int i, cflags=0;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'i': cflags |= REG_ICASE; break;
      case 'm': cflags |= REG_NEWLINE; break;
      case 'x': cflags |= REG_EXTENDED; break;
      case 'n': cflags |= REG_NOSUB; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
  *status_return = regcomp(preg,pattern,cflags);
}

void C_regexec(const regex_t *preg, const char *string, int nmatch,
               int matches[nmatch][2], const char *flags,
               int *status_return) {
  int i, eflags=0;
  regmatch_t *pmatch;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'b': eflags |= REG_NOTBOL; break;
      case 'e': eflags |= REG_NOTEOL; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
  if (nmatch>0 && sizeof(pmatch->rm_so)!=sizeof(matches[0][0])) {
    pmatch = malloc(sizeof(regmatch_t)*nmatch);
    *status_return = regexec(preg,string,nmatch,pmatch,eflags);
    for (i=0;i<nmatch;i++) {
      matches[i][0]=pmatch[i].rm_so;
      matches[i][1]=pmatch[i].rm_eo;
    }
    free(pmatch);
  } else {
    *status_return = regexec(preg,string,nmatch,(regmatch_t*)&(matches[0][0]),eflags);
  }
}

A simple example program

program test_program
  use regex_module
  logical match
  integer status, matches(2,1)
  type(regex_type) :: regex
  call regcomp(regex,"\(a..\)")
  match=regexec(regex,"what the?",matches)
  write(*,*) 'match=',match,', status=',status
  write(*,*) 'match="',regmatch(1,"what the?",matches),'"'
  call regfree(regex)
end program test_program

Hello!, I am new to this Fortran Wiki, I do not know how to post a comment so I am editing the page. My apologies if this messes up your great work. I did small modification which I think it is an improvement. I hope it helps.

#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdlib.h>

void C_regalloc(regex_t **preg_return) {
  *preg_return = malloc(sizeof(**preg_return));
}

/* pattern must be NUL terminated. */
void C_regcomp(regex_t *preg, const char *pattern,
               const char *flags, int *status_return) {
  int i, cflags=0;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'i': cflags |= REG_ICASE; break;
      case 'm': cflags |= REG_NEWLINE; break;
      case 'x': cflags |= REG_EXTENDED; break;
      case 'n': cflags |= REG_NOSUB; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
  *status_return = regcomp(preg,pattern,cflags);
}

void C_regexec(const regex_t *preg, const char *string, int nmatch,
               int matches[nmatch][2], const char *flags,
               int *status_return) {
  int i, eflags=0;
  int j=0;
  const char * p = string;
  regmatch_t *pmatch;
  int start;
  int finish;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'b': eflags |= REG_NOTBOL; break;
      case 'e': eflags |= REG_NOTEOL; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
   // Added by Trurl The Constructor
   // Elkin Arroyo
   pmatch = malloc(sizeof(regmatch_t)*nmatch);
   while(1 && j<=nmatch) {
   int no_match;
   *status_return = regexec(preg,p,nmatch,pmatch,eflags);
   if (status_return[0]) {
       break;
   }
   if (pmatch[0].rm_so == -1) {
       break;
   }
   start  = pmatch[0].rm_so + (p - string);
   finish = pmatch[0].rm_eo + (p - string);
   matches[j][0]=start;
   matches[j][1]=finish;
   p +=  pmatch[0].rm_eo;
   j=j+1;
  }
  free(pmatch);
}

This is my modified simple test program

program test_program
  use regex_module
  logical match
  integer nmatch
  integer status, matches(2,10)
  type(regex_type) :: regex
  nmatch=10
  call regcomp(regex,"([0-9\.\-\*\/]+)+",'xm')
  match=regexec(regex,"30*0 250*1 5 6 7",matches)
  DO i=1,nmatch
   write(*,*) 'match="',regmatch(i,"30*0 250*1 5 6 7",matches),'"'
  ENDDO
  call regfree(regex)
end program test_program

This is the results of the program once it ran

 match="30*0"
 match="250*1"
 match="5"
 match="6"
 match="7"
 match=""
 match=""
 match=""
 match=""
 match=""

M.M.: This is what I was looking for and tried it out and discovered slight problems on some test and tried to fix it. I like to contribute with the revised sources:

  • regexp_module: init of matches is included

  • regex.c: iteration to find more than 1 matches is cleaned up

  • test_program: allows for input of rep and text to make test more flexible…

module regex_module
!-------------------------------------------------------------------
! Fortran interface to POSIX regex, using ISO_C_BINDING.
!
! Regex is defined as an API using C headers. It does not define the
! exact value of flag tokens, just the names. It also uses an opaque
! data structure and a declared numeric type for the match array.
! Therefore, the code must either be generated for each target
! platform, or it use wrapper functions written in C.
!
! Fortran wrapper functions are also required to present a normal
! Fortran API, ant not require C conversions by the caller.
!
! The interface here is not strictly correct, because it does not
! explicitly convert Fortran strings to the C character kind. 
! Fortran only supports conversion of string kinds by assignment,
! or by a rather slow internal WRITE. For now, the easiest approach
! is to assume that C and Fortran default  character kinds are the
! same. This is generally true, but UTF-8 strings are likely to
! cause problems.
!-------------------------------------------------------------------
! API:
!
! Compile a regex into a regex object
!  subroutine regcomp(this,pattern,flags,status)
!    type(regex_type), intent(out) :: this           ! new regex object
!    character(len=*), intent(in) :: pattern         ! regex pattern string
!    character(len=*), intent(in), &
!                         optional :: flags ! flag characters:
!                                           ! x = extended regex (REG_EXTENDED)
!                                           ! m = multi-line     (REG_NEWLINE)
!                                           ! i = case-insensitive (REG_ICASE)
!                                           ! n = no MATCH required (REG_NOSUB)
!    integer, intent(out), optional :: status ! If absent, errors are fatal
!  end subroutine regcomp
!
! Execute a compiled regex against a string
!  function regexec(this,string,matches,flags,status) result(match)
!    logical :: match ! .TRUE. if the pattern matched
!    type(regex_type), intent(in) :: this ! regex object
!    character(len=*), intent(in) :: string ! target string
!    character(len=*), intent(in), &
!                     optional :: flags ! flag characters (for partial lines):
!                                       ! b = no beginning-of-line (REG_NOTBOL)
!                                       ! e = no end-of-line (REG_NOTEOL)
!    integer, intent(out), optional :: matches(:,:) ! match locations,
!                                                   ! dimension(2,nmatches)
!    integer, intent(out), optional :: status ! If absent, errors are fatal
!  end function
!
! Get the string message for a status error value
!  subroutine regerror(this,errcode,errmsg,errmsg_len)
!    type(regex_type), intent(in) :: this
!    integer, intent(in) :: errcode
!    character, intent(out) :: errmsg
!    integer, intent(out) :: errmsg_len
!    errmsg_len = C_regerror(int(errcode,C_int), this%preg, &
!                 errmsg, int(len(errmsg),C_size_t))
!  end subroutine regerror
!
! Release 
!  subroutine regfree(this)
!    type(regex_type), intent(inout) :: this
!  end subroutine regfree
!-------------------------------------------------------------------
! TODO:
! * More documentation.
! * Implement allocatable-length strings when commonly available.
! * Maybe store the matches array inside the regex_type structure?
!-------------------------------------------------------------------
  use ISO_C_Binding, only: C_ptr, C_int, C_size_t, C_char, &
                           C_NULL_char, C_NULL_ptr
  use ISO_Fortran_Env, only: ERROR_UNIT

  integer, parameter  :: NO_MATCH = -1
! Fortran regex structure holds a pointer to an opaque C structure
  type regex_type
    type(C_ptr) :: preg
  end type regex_type
  interface
    subroutine C_regalloc(preg_return) &
        bind(C,name="C_regalloc")
      import
      type(C_ptr), intent(out) :: preg_return
    end subroutine C_regalloc
    subroutine C_regcomp(preg,pattern,flags,status) &
        bind(C,name="C_regcomp")
      import
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(in) :: pattern(*)
      character(len=1,kind=C_char), intent(in) :: flags(*)
      integer(C_int), intent(inout) :: status
    end subroutine C_regcomp
    subroutine C_regexec(preg,string,nmatch,matches,flags,status) &
        bind(C,name="C_regexec")
      import
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(in) :: string(*)
      integer(C_int), intent(in), value :: nmatch
      integer(C_int), intent(out) :: matches(2,nmatch)
      character(len=1,kind=C_char), intent(in) :: flags(*)
      integer(C_int), intent(out) :: status
    end subroutine C_regexec
    function C_regerror(errcode, preg, errbuf, errbuf_size) &
        result(regerror) bind(C,name="regerror")
      import
      integer(C_size_t) :: regerror
      integer(C_int), value :: errcode
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(out) :: errbuf
      integer(C_size_t), value :: errbuf_size
    end function C_regerror
    subroutine C_regfree(preg) bind(C,name="regfree")
      import
      type(C_ptr), intent(in), value :: preg
    end subroutine C_regfree
  end interface
contains
  subroutine regcomp(this,pattern,flags,status)
    type(regex_type), intent(out) :: this
    character(len=*), intent(in) :: pattern
    character(len=*), intent(in), optional :: flags
    integer, intent(out), optional :: status
! local
    integer(C_int) :: status_
    character(len=10,kind=C_char) :: flags_
! begin
    flags_=' '
    if (present(flags)) flags_=flags
    this%preg = C_NULL_ptr
    call C_regalloc(this%preg)
    call C_regcomp(this%preg, trim(pattern)//C_NULL_char, &
                   trim(flags)//C_NULL_char, status_)
    if (present(status)) then
      status=status_
    else if (status_/=0) then

      stop 'Regex runtime error: regcomp failed.'
    end if
  end subroutine regcomp
  logical function regexec(this,string,matches,flags,status) &
        result(match)
    type(regex_type), intent(in) :: this
    character(len=*), intent(in) :: string
    character(len=*), intent(in), optional :: flags
    integer, intent(out), optional :: matches(:,:)
    integer, intent(out), optional :: status
! local
    integer(C_int) :: status_, matches_(2,1)
    character(len=10,kind=C_char) :: flags_
! begin
    matches = NO_MATCH   !! m.m. added to allow for the extension nmatch > 1
    flags_=' '
    if (present(flags)) flags_=flags
!    write(*,*) 'calling C, nmatches=',size(matches,2)
    if (present(matches)) then
      call C_regexec(this%preg, trim(string)//C_NULL_char, &
                   size(matches,2),matches, &
                   trim(flags_)//C_NULL_char, status_)
    else
      call C_regexec(this%preg, trim(string)//C_NULL_char, &
                   int(0,C_int),matches_, &
                   trim(flags_)//C_NULL_char, status_)
    end if
    match = status_==0
    if (present(status)) then
      status=status_
    else if (status_/=0.and.status_/=1) then
      stop 'Regex runtime error: regexec failed.'
    end if
  end function regexec
  function regmatch(match,string,matches)
    integer, intent(in) :: match, matches(2,*)
    character(len=*), intent(in) :: string
    character(len=matches(2,match)-matches(1,match)) :: regmatch
    regmatch = string(matches(1,match)+1:matches(2,match))
  end function regmatch
  subroutine regerror(this,errcode,errmsg,errmsg_len)
    type(regex_type), intent(in) :: this
    integer, intent(in) :: errcode
    character, intent(out) :: errmsg
    integer, intent(out) :: errmsg_len
    errmsg_len = C_regerror(int(errcode,C_int), this%preg, &
                 errmsg, int(len(errmsg),C_size_t))
  end subroutine regerror
  subroutine regfree(this)
    type(regex_type), intent(inout) :: this
    call C_regfree(this%preg)
    this%preg = C_NULL_ptr
  end subroutine regfree
end module regex_module
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdlib.h>

void C_regalloc(regex_t **preg_return) {
  *preg_return = malloc(sizeof(**preg_return));
}

/* pattern must be NUL terminated. */
void C_regcomp(regex_t *preg, const char *pattern,
               const char *flags, int *status_return) {
  int i, cflags=0;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'i': cflags |= REG_ICASE; break;
      case 'm': cflags |= REG_NEWLINE; break;
      case 'x': cflags |= REG_EXTENDED; break;
      case 'n': cflags |= REG_NOSUB; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
  *status_return = regcomp(preg,pattern,cflags);
}

void C_regexec(const regex_t *preg, const char *string, int nmatch,
               int matches[nmatch][2], const char *flags,
               int *status_return) {
  int i, eflags=0;
  int j=0;
  const char * p = string;
  regmatch_t *pmatch;
  int start;
  int finish;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'b': eflags |= REG_NOTBOL; break;
      case 'e': eflags |= REG_NOTEOL; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
   // Added by Trurl The Constructor
   // Elkin Arroyo
   pmatch = malloc(sizeof(regmatch_t));              // m.m. modified
   for(j=0;j<nmatch;j++) {                           // m.m. modified
   *status_return = regexec(preg,p,1,pmatch,eflags); // m.m. modified
   if (status_return[0] == REG_NOMATCH) {            // m.m. modified
       break;
   }
// m.m some lines removed
   start  = pmatch[0].rm_so + (p - string);
   finish = pmatch[0].rm_eo + (p - string);
   matches[j][0]=start;
   matches[j][1]=finish;
   p +=  pmatch[0].rm_eo;
  }
  free(pmatch);
}
program test_program2  ! m.m. modifed
  use regex_module
  implicit none

  integer, parameter        :: nmatch     = 10
  integer, parameter        :: linelength = 200

  logical                   :: match
  integer                   :: i, istatus, matches(2,nmatch)
  type(regex_type)          :: regex
  character(len=linelength) :: inlin, regc, found

  write(*,'(a)',advance="no")"regexp: "
  read (*,'(a)',end=999     ) regc
  DO
    write(*,'(a)',advance="no") "text: "
    read( *,'(a)',end=999     ) inlin
    call regcomp(regex,trim(regc),'xm')       ! trim --> blanks at the end must be specified as [ ]
    match = regexec(regex,trim(inlin),matches,status=istatus)
    FORALL(i=1:len(found))found(i:i)="-"
    DO i=1,nmatch
      if(matches(1,i) == NO_MATCH) exit   
      write(*,*) 'match=',matches(:,i),'"',regmatch(i,trim(inlin),matches),'"'
      found(matches(1,i)+1:matches(2,i)) = regmatch(i,trim(inlin),matches)
    ENDDO
    
    write(*,*)
    write(*,'(a)')trim(inlin)
    write(*,'(a)')trim(found(1:len_trim(inlin)))
    
  ENDDO

  call regfree(regex)

999 stop
end program test_program2