Grepping any type of file encoding in Python

Let’s take handling any encoding of files one step further.

We need to look for specific text in files in a directory, regardless of encoding.  Here is one way in Python.

#! /usr/bin/python
import sys
import os.path
import os
import re
import fnmatch

def DecodeBytes(byteArray, codecs=('utf-8', 'utf-16')):
  """Decode byteArray with the first codec in codecs that succeeds.

  The codecs are tried in order. Returns the decoded text, or None when
  no codec in the list can decode the data.
  """
  for codec in codecs:
    try:
      return byteArray.decode(codec)
    except (UnicodeDecodeError, LookupError):
      # UnicodeDecodeError: the bytes are not valid in this codec.
      # LookupError: the codec name is unknown. Either way, try the next one.
      continue
  return None

def ReadLinesFromFile(filename):
  """Read filename as bytes, decode it, and return its lines.

  The raw bytes are decoded with DecodeBytes. Returns a list of lines
  without their line terminators, or None when DecodeBytes could not
  decode the content.
  """
  # The with-block closes the file even if read() raises.
  with open(filename, "rb") as f:
    rawbytes = f.read()
  content = DecodeBytes(rawbytes)
  if content is None:
    return None
  # splitlines() recognises \n, \r\n and \r (plus the other line boundaries
  # it defines) regardless of the host platform, unlike split(os.linesep)
  # which only matches the separator of the machine running the script.
  return content.splitlines()

# this came from http://stackoverflow.com/questions/1863236/grep-r-in-python
# with a substitution of ReadLinesFromFile and a file name match filter
def RecursiveGrep(pattern, dir, match):
  """Yield every line matching pattern in files under dir named like match.

  pattern is a regular expression searched for in each line; dir is the
  root directory walked recursively; match is an fnmatch-style file name
  filter (for example "*.cs").

  Each result is a string "<file path>|<line index>|<stripped line>", where
  the line index is the zero-based position of the line within its file.
  Files that ReadLinesFromFile cannot decode (it returns None) are skipped.
  """
  r = re.compile(pattern)
  for parent, dnames, fnames in os.walk(dir):
    for fname in fnmatch.filter(fnames, match):
      filename = os.path.join(parent, fname)
      if not os.path.isfile(filename):
        continue
      lines = ReadLinesFromFile(filename)
      if lines is None:
        continue
      # enumerate advances the index on every line, not only on matching
      # ones, so the reported number is the line's real position.
      for idx, line in enumerate(lines):
        if r.search(line):
          yield filename + "|" + str(idx) + "|" + line.strip()

# Raw string: "\y" is not a valid escape sequence, so a plain literal raises
# an invalid-escape warning on newer Python versions. The value is unchanged.
# RecursiveGrep is a generator; nothing is searched until lines is iterated.
lines = RecursiveGrep("needle", r"\yourpath", "*.cs")

This will recurse through all subdirectories, looking in all .cs files for "needle", and return the data in this format (pipe separated):

full file path|line number|line content

Very useful on Windows with multilingual files.

Leave a Reply

Your email address will not be published. Required fields are marked *