python使用正则搜索字符串或文件中的浮点数代码

作者: 新金沙平台  发布:2019-08-02

代码如下:

平时用 Python 和 numpy 处理数据的次数比较多,于是写了几个小函数,可以方便地读写数据:

在采集网页信息的时候,经常需要伪造报头或先登录站点,采集脚本才能有效执行;下面的代码演示如何用 urllib2 和 cookielib 登录 Discuz 论坛:

#coding:gbk
import urllib2,urllib,cookielib,re

def login_dz(**parms):
  """
  Log in to a Discuz! (DZ) forum over HTTP.

  Keyword arguments (any that are missing default to ''):
    username   -- account name (required)
    password   -- account password (required)
    domain     -- site root URL; must end with '/', e.g.
                  'http://www.xxx.xx/' (required)
    answer     -- answer to the security question
    questionid -- id of the security question (sent as 0 when omitted)
    referer    -- URL to return to after login (defaults to domain)

  Returns True when the first 300 bytes of the login response contain
  'succeedhandle_login'; otherwise returns a failure-message string that
  includes the username.

  Raises Exception when the formhash token cannot be found in the
  pre-login page.

  Side effect: the cookies received from the pre-login request are saved
  to './kan_cookies.dat'.
  """
  # Normalise the accepted keyword arguments; unknown keys are ignored.
  parms_key = ['domain', 'answer', 'password', 'questionid', 'referer', 'username']
  arg = dict((key, parms.get(key, '')) for key in parms_key)

  # Cookie handling: every request made through `opener` shares one jar.
  cookieFile = './kan_cookies.dat'
  cookie = cookielib.LWPCookieJar()
  opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))

  # Fetch the login form to obtain the formhash token.
  pre_login = arg['domain'] + 'member.php?mod=logging&action=login&infloat=yes&handlekey=login&inajax=1&ajaxtarget=fwin_content_login'
  response = opener.open(pre_login)
  try:
    c = response.read()
  finally:
    response.close()
  cookie.save(cookieFile)
  patt = re.compile(r'.*?name="formhash".*?value="(.*?)".*?')
  formhash = patt.search(c)
  if not formhash:
    raise Exception('GET formhash Fail!')
  formhash = formhash.group(1)

  # Submit the login form.
  postdata = {
    'answer': arg['answer'],
    'formhash': formhash,
    'password': arg['password'],
    'questionid': 0 if arg['questionid'] == '' else arg['questionid'],
    'referer': arg['domain'] if arg['referer'] == '' else arg['referer'],
    'username': arg['username'],
  }
  postdata = urllib.urlencode(postdata)
  req = urllib2.Request(
    url=arg['domain'] + 'member.php?mod=logging&action=login&loginsubmit=yes&handlekey=login&loginhash=LCaB3&inajax=1',
    data=postdata
  )
  response = opener.open(req)
  try:
    # Only the head of the response is inspected for the success marker.
    c = response.read(300)
  finally:
    response.close()

  flag = '登陆失败 %s'%arg['username']
  if 'succeedhandle_login' in c:
    flag = True
  return flag


# Usage example: log in with only the required parameters.
user = 'xxx'
pwd = 'xxx'
dom = 'http://www.discuz.net/'  # another test site: http://bbs.jb51.net/
try:
  flag = login_dz(username=user, password=pwd, domain=dom)
  print(flag)
except Exception as e:
  # `except Exception, e` is Python-2-only syntax; `as` works on 2.6+ and 3.x.
  # Format into one string so Python 2 does not print a tuple.
  print('Error: %s' % e)
# -*- coding: utf-8 -*-
#----------------------------------------------------------------------
# FileName:gettxtdata.py
#功能:读取字符串和文件中的数值数据(浮点数)
#主要提供类似matlab中的dlmread和dlmwrite函数
#同时提供loadtxtdata和savetxtdata函数
#Date: 2013-1-10
#Author:吴徐平
#----------------------------------------------------------------------
import numpy
#----------------------------------------------------------------------
def StringToDoubleArray(String):
  """
  Extract the floating-point numbers contained in a string.

  Every character that cannot be part of a float literal is replaced by a
  space, and '#' comments are removed up to the end of their line. The
  remaining text is parsed with numpy.genfromtxt.

  Parameters:
    String -- text to scan; may span several lines.

  Returns:
    A one-dimensional numpy array of float64 values. It is empty when the
    string holds no numeric text.
  """
  import re
  try:
    from StringIO import StringIO  # Python 2
  except ImportError:
    from io import StringIO  # Python 3

  DataArray = numpy.empty([0], numpy.float64)
  if not String.strip():
    return DataArray

  # Strip '#' comments. re.MULTILINE makes '$' match at every line end, so
  # comments on any line are removed, not only one on the last line.
  doublestring = re.sub(r'#.*$', ' ', String, flags=re.MULTILINE)
  # Blank out every character that cannot appear in a float literal
  # (this also turns newlines and tabs into spaces).
  doublestring = re.sub(r'[^0-9.e -]', ' ', doublestring, flags=re.IGNORECASE)
  # Remove dangling '.', 'e' or '-' runs that sit directly before whitespace.
  # The '+' covers runs of any length in one pass.
  doublestring = re.sub(r'[.e -]+(?=\s)', ' ', doublestring, flags=re.IGNORECASE)
  # Remove a dangling 'e' / '-' run at the very end of the text.
  doublestring = re.sub(r'[e -]+$', ' ', doublestring, flags=re.IGNORECASE)
  doublestring = doublestring.strip()

  if doublestring:
    # genfromtxt yields a 0-d array for a single number; promote it so the
    # result is always one-dimensional.
    DataArray = numpy.atleast_1d(numpy.genfromtxt(StringIO(doublestring)))

  return DataArray

#----------------------------------------------------------------------
def GetDoubleListFromString(String):
  """
  Extract the floating-point numbers of a string, line by line.

  The string is split into lines. In each line every character that
  cannot be part of a float literal is replaced by a space, and a '#'
  comment is removed up to the end of the line. Lines that contain no
  numeric text are skipped.

  Parameters:
    String -- text to scan.

  Returns:
    A list with one one-dimensional numpy array per numeric line. Rows
    may have different lengths.
  """
  import re
  try:
    from StringIO import StringIO  # Python 2
  except ImportError:
    from io import StringIO  # Python 3

  DoubleList = []
  # Split on real line breaks (the original split on the letter 'n').
  for Line in String.splitlines():
    if not Line.strip():
      continue
    # Strip the '#' comment of this line.
    doublestring = re.sub(r'#.*$', ' ', Line)
    # Blank out every character that cannot appear in a float literal.
    doublestring = re.sub(r'[^0-9.e -]', ' ', doublestring, flags=re.IGNORECASE)
    # Remove dangling '.', 'e' or '-' runs that sit directly before whitespace.
    doublestring = re.sub(r'[.e -]+(?=\s)', ' ', doublestring, flags=re.IGNORECASE)
    # Remove a dangling 'e' / '-' run at the end of the line.
    doublestring = re.sub(r'[e -]+$', ' ', doublestring, flags=re.IGNORECASE)
    doublestring = doublestring.strip()
    if doublestring:
      # genfromtxt yields a 0-d array for a single number; promote it so
      # every row supports .size and indexing.
      DoubleList.append(numpy.atleast_1d(numpy.genfromtxt(StringIO(doublestring))))
  return DoubleList

#----------------------------------------------------------------------
def GetDoubleListFromFile(FileName):
  """
  Read a text file and extract the floating-point numbers of each line.

  The whole file is read into memory and handed to
  GetDoubleListFromString.

  Parameters:
    FileName -- path of the text file to read.

  Returns:
    The list produced by GetDoubleListFromString: one numpy array per
    numeric line. Lines may hold different counts of numbers.
  """
  # `with` closes the handle even if read() raises.
  with open(FileName, 'r') as file_handle:
    read_file = file_handle.read()
  return GetDoubleListFromString(read_file)

def dlmread(FileName,dtype=numpy.float64):
  """
  Load numeric data from a text file into a two-dimensional array.

  Parsing is delegated to GetDoubleListFromFile, so the layout of the file
  is loosely constrained and '#' comments are allowed. This is the most
  tolerant reader in this module, and the slowest.

  Parameters:
    FileName -- path of the text file to read.
    dtype    -- element type of the returned array.

  Returns:
    A numpy.ndarray with one row per numeric line. When lines hold
    different counts of numbers, every row is truncated to the shortest
    line. A file with no numeric lines gives an empty array of shape
    (0, 0).
  """
  # Promote any 0-d rows so .size and slicing behave uniformly.
  DoubleList = [numpy.atleast_1d(dL) for dL in GetDoubleListFromFile(FileName)]
  if not DoubleList:
    # min() below would raise ValueError on an empty sequence.
    return numpy.empty([0, 0], dtype=dtype)

  # Smallest column count over all rows; longer rows are cut to this width.
  MinColumnSize = min(dL.size for dL in DoubleList)

  DoubleArray = numpy.empty([len(DoubleList), MinColumnSize], dtype=dtype)
  for i, dL in enumerate(DoubleList):
    DoubleArray[i, :] = dL[:MinColumnSize]

  return DoubleArray
#----------------------------------------------------------------------

def loadtxtdata(filename,delimiter=""):
  """
  Load numeric data from a delimited text file.

  Every ',' and ';' and every character of `delimiter` is replaced by a
  space, then the text is parsed with numpy.genfromtxt. This is faster
  than dlmread but stricter about the layout of the file.

  Parameters:
    filename  -- path of the text file to read.
    delimiter -- extra separator characters; each character of the string
                 is treated as a separator on its own.

  Returns:
    The numpy.ndarray produced by numpy.genfromtxt.
  """
  try:
    from StringIO import StringIO  # Python 2
  except ImportError:
    from io import StringIO  # Python 3

  # `with` closes the handle even if read() raises.
  with open(filename, 'r') as file_handle:
    LinesALL = file_handle.read()

  # Normalise every separator character to a plain space.
  for RChar in delimiter + ",;":
    LinesALL = LinesALL.replace(RChar, " ")

  return numpy.genfromtxt(StringIO(LinesALL))

#----------------------------------------------------------------------  
def savetxtdata(filename, X, fmt='%.8e', delimiter=' ', newline='\n'):
  """
  Save an array to a text file with numpy.savetxt.

  Parameters:
    filename  -- path of the file to write.
    X         -- data to save.
    fmt       -- format string applied to each number.
    delimiter -- string placed between columns.
    newline   -- string placed between rows. The default is a line feed
                 (it was the letter 'n' before).

  Returns:
    True once the file has been written.
  """
  numpy.savetxt(filename, X, fmt=fmt, delimiter=delimiter, newline=newline)
  return True

#----------------------------------------------------------------------
def dlmwrite(filename, X, fmt='%.8e', delimiter=' ', newline='\n'):
  """
  Save an array to a text file with numpy.savetxt.

  Parameters:
    filename  -- path of the file to write.
    X         -- data to save.
    fmt       -- format string applied to each number.
    delimiter -- string placed between columns.
    newline   -- string placed between rows. The default is a line feed
                 (it was the letter 'n' before).

  Returns:
    True once the file has been written.
  """
  numpy.savetxt(filename, X, fmt=fmt, delimiter=delimiter, newline=newline)
  return True

#----------------------------------------------------------------------
#测试程序 
#----------------------------------------------------------------------
if __name__ == '__main__':
  import os
  import tempfile

  # Generate random data.
  data = numpy.random.randn(3, 4)
  # Write into the platform temp directory; a hard-coded 'D:/x.txt' only
  # works on Windows machines that have a D: drive.
  filename = os.path.join(tempfile.gettempdir(), 'x.txt')
  # Write the data, then read it back with each reader.
  dlmwrite(filename, data)
  x = GetDoubleListFromFile(filename)
  print(x)
  print(dlmread(filename))
  y = StringToDoubleArray('79l890joj')
  print(y)
  z = loadtxtdata(filename)
  print(z)

下面,我们将使用urllib2的header部分伪造报头来实现采集信息

本文由新金沙平台发布于新金沙平台,转载请注明出处:python使用正则搜索字符串或文件中的浮点数代码

关键词: 新金沙平台

上一篇:诊所设置,Socket编程入门教程
下一篇:没有了