encoding | Geek Droppings

Let’s take handling any encoding of files one step further.

We need to look for specific text in files in a directory, regardless of their encoding. Here is one way to do it in Python.

#! /usr/bin/python
import sys
import os.path
import os
import re
import fnmatch

def DecodeBytes(byteArray, codecs=('utf-8', 'utf-16')):
  """Decode raw bytes with the first codec in ``codecs`` that succeeds.

  Args:
    byteArray: The raw bytes to decode.
    codecs: Codec names to try, in order.

  Returns:
    The decoded text, or None when none of the codecs can decode the input.
  """
  for codec in codecs:
    try:
      return byteArray.decode(codec)
    except (UnicodeError, LookupError):
      # UnicodeError: the bytes are not valid in this codec.
      # LookupError: the codec name is unknown.
      # Either way, move on to the next candidate. Anything else (such as
      # KeyboardInterrupt) is deliberately allowed to propagate.
      continue
  return None

def ReadLinesFromFile(filename):
  """Read a file of unknown encoding and return its text as a list of lines.

  The file is read as raw bytes and handed to DecodeBytes, which tries its
  default codecs in order.

  Args:
    filename: Path of the file to read.

  Returns:
    A list of lines without their line terminators, or None when DecodeBytes
    could not decode the file's bytes.
  """
  # The with-block closes the handle even if read() raises.
  with open(filename, "rb") as handle:
    rawbytes = handle.read()
  content = DecodeBytes(rawbytes)
  if content is None:
    return None
  # splitlines() treats "\n", "\r\n" and "\r" (plus the other line boundaries
  # it recognises) alike, so the result does not depend on the os.linesep of
  # the machine running the script.
  return content.splitlines()

# this came from http://stackoverflow.com/questions/1863236/grep-r-in-python
# with a substitution of ReadLinesFromFile and a file name match filter
def RecursiveGrep(pattern, dir, match):
  """Search the files under a directory tree for lines matching a regex.

  Args:
    pattern: Regular expression looked for in each line with re.search.
    dir: Root directory; it is walked recursively with os.walk.
    match: fnmatch-style file name filter, e.g. "*.cs".

  Yields:
    One string per matching line, pipe separated:
    "<file path>|<line index>|<stripped line>". The line index is the
    zero-based position of the line within its file.
  """
  r = re.compile(pattern)
  for parent, _dnames, fnames in os.walk(dir):
    for fname in fnmatch.filter(fnames, match):
      filename = os.path.join(parent, fname)
      if not os.path.isfile(filename):
        continue
      lines = ReadLinesFromFile(filename)
      if lines is None:
        # The file could not be decoded; skip it.
        continue
      # enumerate advances the index on every line, not only on matches, so
      # the reported index is the line's real position in the file.
      for idx, line in enumerate(lines):
        if r.search(line):
          yield filename + "|" + str(idx) + "|" + line.strip()

# Raw string keeps the backslash literal; "\y" is not a valid escape sequence.
lines = RecursiveGrep("needle", r"\yourpath", "*.cs")

This will recurse through all subdirectories, looking in all .cs files for "needle", and return the matches in this format (pipe separated):

full file path|line number|line content

Very useful on Windows with multilingual files.

import os

def DecodeBytes(byteArray, codecs=('utf-8', 'utf-16')):
  """Decode raw bytes with the first codec in ``codecs`` that succeeds.

  Returns the decoded text, or None when none of the codecs can decode it.
  """
  for codec in codecs:
    try:
      return byteArray.decode(codec)
    except (UnicodeError, LookupError):
      # Invalid bytes for this codec, or an unknown codec name: try the next.
      continue
  return None

def ReadLinesFromFile(filename):
  """Read a file of unknown encoding and return its text as a list of lines.

  Returns None when the file's bytes could not be decoded.
  """
  # The with-block closes the handle even if read() raises.
  with open(filename, "rb") as handle:
    rawbytes = handle.read()
  content = DecodeBytes(rawbytes)
  if content is None:
    return None
  # splitlines() handles "\n", "\r\n" and "\r" regardless of the platform.
  return content.splitlines()

lines = ReadLinesFromFile("poo.txt")
# ReadLinesFromFile returns None for an undecodable file; iterate nothing then.
for line in lines or []:
  dosomething(line)

Geek Droppings

Tag Archives: encoding

Grepping any type of file encoding in Python

Getting lines of a file of any encoding type in Python

The poop on being a geek