Clicky

Fortran Wiki
regex_module

Page "Fox" does not exist.
Please create it now, or hit the "back" button in your browser.

This interface illustrates how wrapper functions can simplify the user API, and handle C header declarations that are not readily accessible in Fortran.

Below are sources for the main Fortran module, the C support/wrapper code, and a small example program.

This is only a preliminary version, It would be nice to automatically handle the match array, but the maximum number of matches must still be declared by the caller.

PCRE is a much more powerful pattern library, and may be a better choice than POSIX regex. You may find some wrappers in the projects here: fortran_pcre fregex ftlRegex

module regex_module
!-------------------------------------------------------------------
! Fortran interface to POSIX regex, using ISO_C_BINDING.
!
! Regex is defined as an API using C headers. It does not define the
! exact value of flag tokens, just the names. It also uses an opaque
! data structure and a declared numeric type for the match array.
! Therefore, the code must either be generated for each target
! platform, or it use wrapper functions written in C.
!
! Fortran wrapper functions are also required to present a normal
! Fortran API, ant not require C conversions by the caller.
!
! The interface here is not strictly correct, because it does not
! explicitly convert Fortran strings to the C character kind. 
! Fortran only supports conversion of string kinds by assignment,
! or by a rather slow internal WRITE. For now, the easiest approach
! is to assume that C and Fortran default  character kinds are the
! same. This is generally true, but UTF-8 strings are likely to
! cause problems.
!-------------------------------------------------------------------
! API:
!
! Compile a regex into a regex object
!  subroutine regcomp(this,pattern,flags,status)
!    type(regex_type), intent(out) :: this           ! new regex object
!    character(len=*), intent(in) :: pattern         ! regex pattern string
!    character(len=*), intent(in), &
!                         optional :: flags ! flag characters:
!                                           ! x = extended regex (REG_EXTENDED)
!                                           ! m = multi-line     (REG_NEWLINE)
!                                           ! i = case-insensitive (REG_ICASE)
!                                           ! n = no MATCH required (REG_NOSUB)
!    integer, intent(out), optional :: status ! If absent, errors are fatal
!  end subroutine regcomp
!
! Execute a compiled regex against a string
!  function regexec(this,string,matches,flags,status) result(match)
!    logical :: match ! .TRUE. if the pattern matched
!    type(regex_type), intent(in) :: this ! regex object
!    character(len=*), intent(in) :: string ! target string
!    character(len=*), intent(in), &
!                     optional :: flags ! flag characters (for partial lines):
!                                       ! b = no beginning-of-line (REG_NOTBOL)
!                                       ! e = no end-of-line (REG_NOTEOL)
!    integer, intent(out), optional :: matches(:,:) ! match locations,
!                                                   ! dimension(2,nmatches)
!    integer, intent(out), optional :: status ! If absent, errors are fatal
!  end function
!
! Get the string message for a status error value
!  subroutine regerror(this,errcode,errmsg,errmsg_len)
!    type(regex_type), intent(in) :: this
!    integer, intent(in) :: errcode
!    character, intent(out) :: errmsg
!    integer, intent(out) :: errmsg_len
!    errmsg_len = C_regerror(int(errcode,C_int), this%preg, &
!                 errmsg, int(len(errmsg),C_size_t))
!  end subroutine regerror
!
! Release 
!  subroutine regfree(this)
!    type(regex_type), intent(inout) :: this
!  end subroutine regfree
!-------------------------------------------------------------------
! TODO:
! * More documentation.
! * Implement allocatable-length strings when commonly available.
! * Maybe store the matches array inside the regex_type structure?
!-------------------------------------------------------------------
  use ISO_C_Binding, only: C_ptr, C_int, C_size_t, C_char, &
                           C_NULL_char, C_NULL_ptr
  use ISO_Fortran_Env, only: ERROR_UNIT
! Fortran regex structure holds a pointer to an opaque C structure
  type regex_type
    type(C_ptr) :: preg
  end type regex_type
  interface
    subroutine C_regalloc(preg_return) &
        bind(C,name="C_regalloc")
      import
      type(C_ptr), intent(out) :: preg_return
    end subroutine C_regalloc
    subroutine C_regcomp(preg,pattern,flags,status) &
        bind(C,name="C_regcomp")
      import
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(in) :: pattern(*)
      character(len=1,kind=C_char), intent(in) :: flags(*)
      integer(C_int), intent(inout) :: status
    end subroutine C_regcomp
    subroutine C_regexec(preg,string,nmatch,matches,flags,status) &
        bind(C,name="C_regexec")
      import
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(in) :: string(*)
      integer(C_int), intent(in), value :: nmatch
      integer(C_int), intent(out) :: matches(2,nmatch)
      character(len=1,kind=C_char), intent(in) :: flags(*)
      integer(C_int), intent(out) :: status
    end subroutine C_regexec
    function C_regerror(errcode, preg, errbuf, errbuf_size) &
        result(regerror) bind(C,name="regerror")
      import
      integer(C_size_t) :: regerror
      integer(C_int), value :: errcode
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(out) :: errbuf
      integer(C_size_t), value :: errbuf_size
    end function C_regerror
    subroutine C_regfree(preg) bind(C,name="regfree")
      import
      type(C_ptr), intent(in), value :: preg
    end subroutine C_regfree
  end interface
contains
  subroutine regcomp(this,pattern,flags,status)
    type(regex_type), intent(out) :: this
    character(len=*), intent(in) :: pattern
    character(len=*), intent(in), optional :: flags
    integer, intent(out), optional :: status
! local
    integer(C_int) :: status_
    character(len=10,kind=C_char) :: flags_
! begin
    flags_=' '
    if (present(flags)) flags_=flags
    this%preg = C_NULL_ptr
    call C_regalloc(this%preg)
    call C_regcomp(this%preg, trim(pattern)//C_NULL_char, &
                   trim(flags)//C_NULL_char, status_)
    if (present(status)) then
      status=status_
    else if (status_/=0) then

      stop 'Regex runtime error: regcomp failed.'
    end if
  end subroutine regcomp
  logical function regexec(this,string,matches,flags,status) &
        result(match)
    type(regex_type), intent(in) :: this
    character(len=*), intent(in) :: string
    character(len=*), intent(in), optional :: flags
    integer, intent(out), optional :: matches(:,:)
    integer, intent(out), optional :: status
! local
    integer(C_int) :: status_, matches_(2,1)
    character(len=10,kind=C_char) :: flags_
! begin
    flags_=' '
    if (present(flags)) flags_=flags
    write(*,*) 'calling C, nmatches=',nmatches
    if (present(matches)) then
      call C_regexec(this%preg, trim(string)//C_NULL_char, &
                   size(matches,2),matches, &
                   trim(flags_)//C_NULL_char, status_)
    else
      call C_regexec(this%preg, trim(string)//C_NULL_char, &
                   int(0,C_int),matches_, &
                   trim(flags_)//C_NULL_char, status_)
    end if
    match = status_==0
    if (present(status)) then
      status=status_
    else if (status_/=0.and.status_/=1) then
      stop 'Regex runtime error: regexec failed.'
    end if
  end function regexec
  function regmatch(match,string,matches)
    integer, intent(in) :: match, matches(2,*)
    character(len=*), intent(in) :: string
    character(len=matches(2,match)-matches(1,match)) :: regmatch
    regmatch = string(matches(1,match)+1:matches(2,match))
  end function regmatch
  subroutine regerror(this,errcode,errmsg,errmsg_len)
    type(regex_type), intent(in) :: this
    integer, intent(in) :: errcode
    character, intent(out) :: errmsg
    integer, intent(out) :: errmsg_len
    errmsg_len = C_regerror(int(errcode,C_int), this%preg, &
                 errmsg, int(len(errmsg),C_size_t))
  end subroutine regerror
  subroutine regfree(this)
    type(regex_type), intent(inout) :: this
    call C_regfree(this%preg)
    this%preg = C_NULL_ptr
  end subroutine regfree
end module regex_module
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdlib.h>

void C_regalloc(regex_t **preg_return) {
  *preg_return = malloc(sizeof(**preg_return));
}

/* pattern must be NUL terminated. */
void C_regcomp(regex_t *preg, const char *pattern,
               const char *flags, int *status_return) {
  int i, cflags=0;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'i': cflags |= REG_ICASE; break;
      case 'm': cflags |= REG_NEWLINE; break;
      case 'x': cflags |= REG_EXTENDED; break;
      case 'n': cflags |= REG_NOSUB; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
  *status_return = regcomp(preg,pattern,cflags);
}

void C_regexec(const regex_t *preg, const char *string, int nmatch,
               int matches[nmatch][2], const char *flags,
               int *status_return) {
  int i, eflags=0;
  regmatch_t *pmatch;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'b': eflags |= REG_NOTBOL; break;
      case 'e': eflags |= REG_NOTEOL; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
  if (nmatch>0 && sizeof(pmatch->rm_so)!=sizeof(matches[0][0])) {
    pmatch = malloc(sizeof(regmatch_t)*nmatch);
    *status_return = regexec(preg,string,nmatch,pmatch,eflags);
    for (i=0;i<nmatch;i++) {
      matches[i][0]=pmatch[i].rm_so;
      matches[i][1]=pmatch[i].rm_eo;
    }
    free(pmatch);
  } else {
    *status_return = regexec(preg,string,nmatch,(regmatch_t*)&(matches[0][0]),eflags);
  }
}

A simple example program

program test_program
  use regex_module
  logical match
  integer status, matches(2,1)
  type(regex_type) :: regex
  call regcomp(regex,"\(a..\)")
  match=regexec(regex,"what the?",matches)
  write(*,*) 'match=',match,', status=',status
  write(*,*) 'match="',regmatch(1,"what the?",matches),'"'
  call regfree(regex)
end program test_program

Hello!, I am new to this Fortran Wiki, I do not know how to post a comment so I am editing the page. My apologies if this messes up your great work. I did small modification which I think it is an improvement. I hope it helps.

#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdlib.h>

void C_regalloc(regex_t **preg_return) {
  *preg_return = malloc(sizeof(**preg_return));
}

/* pattern must be NUL terminated. */
void C_regcomp(regex_t *preg, const char *pattern,
               const char *flags, int *status_return) {
  int i, cflags=0;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'i': cflags |= REG_ICASE; break;
      case 'm': cflags |= REG_NEWLINE; break;
      case 'x': cflags |= REG_EXTENDED; break;
      case 'n': cflags |= REG_NOSUB; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
  *status_return = regcomp(preg,pattern,cflags);
}

void C_regexec(const regex_t *preg, const char *string, int nmatch,
               int matches[nmatch][2], const char *flags,
               int *status_return) {
  int i, eflags=0;
  int j=0;
  const char * p = string;
  regmatch_t *pmatch;
  int start;
  int finish;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'b': eflags |= REG_NOTBOL; break;
      case 'e': eflags |= REG_NOTEOL; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
   // Added by Trurl The Constructor
   // Elkin Arroyo
   pmatch = malloc(sizeof(regmatch_t)*nmatch);
   while(1 && j<=nmatch) {
   int no_match;
   *status_return = regexec(preg,p,nmatch,pmatch,eflags);
   if (status_return[0]) {
       break;
   }
   if (pmatch[0].rm_so == -1) {
       break;
   }
   start  = pmatch[0].rm_so + (p - string);
   finish = pmatch[0].rm_eo + (p - string);
   matches[j][0]=start;
   matches[j][1]=finish;
   p +=  pmatch[0].rm_eo;
   j=j+1;
  }
  free(pmatch);
}

This is my modified simple test program

program test_program
  use regex_module
  logical match
  integer nmatch
  integer status, matches(2,10)
  type(regex_type) :: regex
  nmatch=10
  call regcomp(regex,"([0-9\.\-\*\/]+)+",'xm')
  match=regexec(regex,"30*0 250*1 5 6 7",matches)
  DO i=1,nmatch
   write(*,*) 'match="',regmatch(i,"30*0 250*1 5 6 7",matches),'"'
  ENDDO
  call regfree(regex)
end program test_program

This is the results of the program once it ran

 match="30*0"
 match="250*1"
 match="5"
 match="6"
 match="7"
 match=""
 match=""
 match=""
 match=""
 match=""

M.M.: This is what I was looking for and tried it out and discovered slight problems on some test and tried to fix it. I like to contribute with the revised sources:

  • regexp_module: init of matches is included

  • regex.c: iteration to find more than 1 matches is cleaned up

  • test_program: allows for input of rep and text to make test more flexible…

module regex_module
!-------------------------------------------------------------------
! Fortran interface to POSIX regex, using ISO_C_BINDING.
!
! Regex is defined as an API using C headers. It does not define the
! exact value of flag tokens, just the names. It also uses an opaque
! data structure and a declared numeric type for the match array.
! Therefore, the code must either be generated for each target
! platform, or it use wrapper functions written in C.
!
! Fortran wrapper functions are also required to present a normal
! Fortran API, ant not require C conversions by the caller.
!
! The interface here is not strictly correct, because it does not
! explicitly convert Fortran strings to the C character kind. 
! Fortran only supports conversion of string kinds by assignment,
! or by a rather slow internal WRITE. For now, the easiest approach
! is to assume that C and Fortran default  character kinds are the
! same. This is generally true, but UTF-8 strings are likely to
! cause problems.
!-------------------------------------------------------------------
! API:
!
! Compile a regex into a regex object
!  subroutine regcomp(this,pattern,flags,status)
!    type(regex_type), intent(out) :: this           ! new regex object
!    character(len=*), intent(in) :: pattern         ! regex pattern string
!    character(len=*), intent(in), &
!                         optional :: flags ! flag characters:
!                                           ! x = extended regex (REG_EXTENDED)
!                                           ! m = multi-line     (REG_NEWLINE)
!                                           ! i = case-insensitive (REG_ICASE)
!                                           ! n = no MATCH required (REG_NOSUB)
!    integer, intent(out), optional :: status ! If absent, errors are fatal
!  end subroutine regcomp
!
! Execute a compiled regex against a string
!  function regexec(this,string,matches,flags,status) result(match)
!    logical :: match ! .TRUE. if the pattern matched
!    type(regex_type), intent(in) :: this ! regex object
!    character(len=*), intent(in) :: string ! target string
!    character(len=*), intent(in), &
!                     optional :: flags ! flag characters (for partial lines):
!                                       ! b = no beginning-of-line (REG_NOTBOL)
!                                       ! e = no end-of-line (REG_NOTEOL)
!    integer, intent(out), optional :: matches(:,:) ! match locations,
!                                                   ! dimension(2,nmatches)
!    integer, intent(out), optional :: status ! If absent, errors are fatal
!  end function
!
! Get the string message for a status error value
!  subroutine regerror(this,errcode,errmsg,errmsg_len)
!    type(regex_type), intent(in) :: this
!    integer, intent(in) :: errcode
!    character, intent(out) :: errmsg
!    integer, intent(out) :: errmsg_len
!    errmsg_len = C_regerror(int(errcode,C_int), this%preg, &
!                 errmsg, int(len(errmsg),C_size_t))
!  end subroutine regerror
!
! Release 
!  subroutine regfree(this)
!    type(regex_type), intent(inout) :: this
!  end subroutine regfree
!-------------------------------------------------------------------
! TODO:
! * More documentation.
! * Implement allocatable-length strings when commonly available.
! * Maybe store the matches array inside the regex_type structure?
!-------------------------------------------------------------------
  use ISO_C_Binding, only: C_ptr, C_int, C_size_t, C_char, &
                           C_NULL_char, C_NULL_ptr
  use ISO_Fortran_Env, only: ERROR_UNIT

  integer, parameter  :: NO_MATCH = -1
! Fortran regex structure holds a pointer to an opaque C structure
  type regex_type
    type(C_ptr) :: preg
  end type regex_type
  interface
    subroutine C_regalloc(preg_return) &
        bind(C,name="C_regalloc")
      import
      type(C_ptr), intent(out) :: preg_return
    end subroutine C_regalloc
    subroutine C_regcomp(preg,pattern,flags,status) &
        bind(C,name="C_regcomp")
      import
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(in) :: pattern(*)
      character(len=1,kind=C_char), intent(in) :: flags(*)
      integer(C_int), intent(inout) :: status
    end subroutine C_regcomp
    subroutine C_regexec(preg,string,nmatch,matches,flags,status) &
        bind(C,name="C_regexec")
      import
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(in) :: string(*)
      integer(C_int), intent(in), value :: nmatch
      integer(C_int), intent(out) :: matches(2,nmatch)
      character(len=1,kind=C_char), intent(in) :: flags(*)
      integer(C_int), intent(out) :: status
    end subroutine C_regexec
    function C_regerror(errcode, preg, errbuf, errbuf_size) &
        result(regerror) bind(C,name="regerror")
      import
      integer(C_size_t) :: regerror
      integer(C_int), value :: errcode
      type(C_ptr), intent(in), value :: preg
      character(len=1,kind=C_char), intent(out) :: errbuf
      integer(C_size_t), value :: errbuf_size
    end function C_regerror
    subroutine C_regfree(preg) bind(C,name="regfree")
      import
      type(C_ptr), intent(in), value :: preg
    end subroutine C_regfree
  end interface
contains
  subroutine regcomp(this,pattern,flags,status)
    type(regex_type), intent(out) :: this
    character(len=*), intent(in) :: pattern
    character(len=*), intent(in), optional :: flags
    integer, intent(out), optional :: status
! local
    integer(C_int) :: status_
    character(len=10,kind=C_char) :: flags_
! begin
    flags_=' '
    if (present(flags)) flags_=flags
    this%preg = C_NULL_ptr
    call C_regalloc(this%preg)
    call C_regcomp(this%preg, trim(pattern)//C_NULL_char, &
                   trim(flags)//C_NULL_char, status_)
    if (present(status)) then
      status=status_
    else if (status_/=0) then

      stop 'Regex runtime error: regcomp failed.'
    end if
  end subroutine regcomp
  logical function regexec(this,string,matches,flags,status) &
        result(match)
    type(regex_type), intent(in) :: this
    character(len=*), intent(in) :: string
    character(len=*), intent(in), optional :: flags
    integer, intent(out), optional :: matches(:,:)
    integer, intent(out), optional :: status
! local
    integer(C_int) :: status_, matches_(2,1)
    character(len=10,kind=C_char) :: flags_
! begin
    matches = NO_MATCH   !! m.m. added to allow for the extension nmatch > 1
    flags_=' '
    if (present(flags)) flags_=flags
!    write(*,*) 'calling C, nmatches=',size(matches,2)
    if (present(matches)) then
      call C_regexec(this%preg, trim(string)//C_NULL_char, &
                   size(matches,2),matches, &
                   trim(flags_)//C_NULL_char, status_)
    else
      call C_regexec(this%preg, trim(string)//C_NULL_char, &
                   int(0,C_int),matches_, &
                   trim(flags_)//C_NULL_char, status_)
    end if
    match = status_==0
    if (present(status)) then
      status=status_
    else if (status_/=0.and.status_/=1) then
      stop 'Regex runtime error: regexec failed.'
    end if
  end function regexec
  function regmatch(match,string,matches)
    integer, intent(in) :: match, matches(2,*)
    character(len=*), intent(in) :: string
    character(len=matches(2,match)-matches(1,match)) :: regmatch
    regmatch = string(matches(1,match)+1:matches(2,match))
  end function regmatch
  subroutine regerror(this,errcode,errmsg,errmsg_len)
    type(regex_type), intent(in) :: this
    integer, intent(in) :: errcode
    character, intent(out) :: errmsg
    integer, intent(out) :: errmsg_len
    errmsg_len = C_regerror(int(errcode,C_int), this%preg, &
                 errmsg, int(len(errmsg),C_size_t))
  end subroutine regerror
  subroutine regfree(this)
    type(regex_type), intent(inout) :: this
    call C_regfree(this%preg)
    this%preg = C_NULL_ptr
  end subroutine regfree
end module regex_module
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdlib.h>

void C_regalloc(regex_t **preg_return) {
  *preg_return = malloc(sizeof(**preg_return));
}

/* pattern must be NUL terminated. */
void C_regcomp(regex_t *preg, const char *pattern,
               const char *flags, int *status_return) {
  int i, cflags=0;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'i': cflags |= REG_ICASE; break;
      case 'm': cflags |= REG_NEWLINE; break;
      case 'x': cflags |= REG_EXTENDED; break;
      case 'n': cflags |= REG_NOSUB; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
  *status_return = regcomp(preg,pattern,cflags);
}

void C_regexec(const regex_t *preg, const char *string, int nmatch,
               int matches[nmatch][2], const char *flags,
               int *status_return) {
  int i, eflags=0;
  int j=0;
  const char * p = string;
  regmatch_t *pmatch;
  int start;
  int finish;
  for (i=0;flags[i];i++) {
    switch (flags[i]) {
      case 'b': eflags |= REG_NOTBOL; break;
      case 'e': eflags |= REG_NOTEOL; break;
      case ' ': break;
      default: *status_return=-2; return;
    }
  }
   // Added by Trurl The Constructor
   // Elkin Arroyo
   pmatch = malloc(sizeof(regmatch_t));              // m.m. modified
   for(j=0;j<nmatch;j++) {                           // m.m. modified
   *status_return = regexec(preg,p,1,pmatch,eflags); // m.m. modified
   if (status_return[0] == REG_NOMATCH) {            // m.m. modified
       break;
   }
// m.m some lines removed
   start  = pmatch[0].rm_so + (p - string);
   finish = pmatch[0].rm_eo + (p - string);
   matches[j][0]=start;
   matches[j][1]=finish;
   p +=  pmatch[0].rm_eo;
  }
  free(pmatch);
}
program test_program2  ! m.m. modifed
  use regex_module
  implicit none

  integer, parameter        :: nmatch     = 10
  integer, parameter        :: linelength = 200

  logical                   :: match
  integer                   :: i, istatus, matches(2,nmatch)
  type(regex_type)          :: regex
  character(len=linelength) :: inlin, regc, found

  write(*,'(a)',advance="no")"regexp: "
  read (*,'(a)',end=999     ) regc
  DO
    write(*,'(a)',advance="no") "text: "
    read( *,'(a)',end=999     ) inlin
    call regcomp(regex,trim(regc),'xm')       ! trim --> blanks at the end must be specified as [ ]
    match = regexec(regex,trim(inlin),matches,status=istatus)
    FORALL(i=1:len(found))found(i:i)="-"
    DO i=1,nmatch
      if(matches(1,i) == NO_MATCH) exit   
      write(*,*) 'match=',matches(:,i),'"',regmatch(i,trim(inlin),matches),'"'
      found(matches(1,i)+1:matches(2,i)) = regmatch(i,trim(inlin),matches)
    ENDDO
    
    write(*,*)
    write(*,'(a)')trim(inlin)
    write(*,'(a)')trim(found(1:len_trim(inlin)))
    
  ENDDO

  call regfree(regex)

999 stop
end program test_program2