Php 需要帮助分析HTTP表单吗
Php 需要帮助分析HTTP表单吗,php,html,post,curl,Php,Html,Post,Curl,为了训练自己使用php和HTML表单,我决定制作一个小型web应用程序,它可以从另一个网站收集数据,但可以在移动设备上显示数据 在本次练习中,我选择了我所在地区的公交公司所在地:。我分析了网站,找到了名为“form1”的表单,它通过POST方法向网站发送数据 我开始编写php代码,在互联网上发现可以使用cURL发送帖子字段。所以我做了。不幸的是,它不起作用。我得到了网站的错误页面。所以我猜肯定有一些字段丢失了,但我已经检查了所有内容,找不到其他字段。就这样,我再次来到这里,寻求帮助 web应用程
为了练习使用 PHP 和 HTML 表单,我决定制作一个小型 web 应用程序:它从另一个网站收集数据,并以适合移动设备的方式显示这些数据。在本次练习中,我选择了我所在地区的公交公司的网站。我分析了该网站,找到了名为“form1”的表单,它通过 POST 方法向网站发送数据。我开始编写 PHP 代码,并在网上发现可以使用 cURL 发送 POST 字段,于是就这么做了。不幸的是,它不起作用:我得到的是该网站的错误页面。所以我猜肯定漏掉了某些字段,但我已经检查了所有内容,找不到其他字段。于是我再次来到这里寻求帮助。web应用程序位于上,也可以位于上 如果有人能帮我解决这个问题,我将不胜感激,
ief2
PS:代码的某些部分是荷兰语编写的,因此这里有一些翻译:
- Gemeente=城镇/城市
- Plaats=位置
- Nummer=数字
- Datum=日期
- Dag=天
- Maand=月份
- Jaar=年
- Uur=小时
- Aankomst=到达
- Vertrek=离开
- Berekenen=计算
PPS:下载链接显然不起作用,但是我下载它时没有问题,所以这里有一些代码片段: index.php
德丽金移动
维特里克:
格米恩特:
斯特拉特:
努默:
安科姆斯特:
格米恩特:
斯特拉特:
努默:
数据:
安科姆斯特
Vertrek
Dag:字段丢失,服务器对post数据的响应非常奇怪。我只能自动化一页。要点击其他链接,cookies显然是不够的
我已经编写了一些代码,这些代码可能对需要了解表单布局的其他人有用:
HTMLFormExtractor.py
#!/usr/bin/python
import sys
import getopt
import urllib
import re
# ############################
# This code may be used by anyone. It may be used in both free
# and commercial software. It may be copied, modified and even
# be sold. The creator of this code takes no responsibility for
# any damage this script could do.
# ############################
# ############################
# ############################
# Usage: ./exec [-x] [URL]
#
# This application logs all forms of an HTML document and it's
# objects which have the HTML 'name'-attribute set. The program
# currently only works when the attributes of the objects are
# styled like the XML format (eg: name="myname").
#
# Options:
# -x: Create an XML document of the following form:
# ==== BEGIN XML ====
# formlist
# form (variable)
# attribute (variable)
# name
# value
#
# object (variable)
# type (eg: input)
# name (eg: username)
# ==== END XML ====
#
# URL: a URL pointing to an available, HTML file. If it's not
# specified specified the program will read the HTML document
# from the standard input.
#
# ############################
# ===== DATA =====
# Tag names of the HTML elements that are reported as objects of a form.
FORM_OBJECTS_TAG_NAME = (
    "input",
    "textarea",
    "label",
    "fieldset",
    "legend",
    "select",
    "optgroup",
    "option",
    "button",
)
# ===== CLASSES =====
class HTMLAttribute:
    """A single attribute of an HTML tag.

    Attributes:
        name: attribute name as written in the tag.
        value: attribute value without its surrounding quotes.
        originalString: the raw text the attribute was built from, or None
            when the attribute was constructed directly.
    """

    def __init__(self, name, value, orString=None):
        self.name = name
        self.value = value
        # Keep the raw source text handed in by the caller.
        self.originalString = orString

    @classmethod
    def withAttributeString(cls, string):
        """Build an attribute from a string of the form attrName="value".

        The value may be wrapped in single or double quotes. Raises
        IndexError when the string has no ``name=`` part or no quoted value.
        """
        attrName = re.findall(r"(\w+)=", string)[0]
        # Prefer a value whose closing quote is the same character as its
        # opening quote, so that "it's" is not cut short at the apostrophe.
        quoted = re.findall(r"([\"'])(.*?)\1", string)
        if quoted:
            value = quoted[0][1]
        else:
            # Mismatched quotes: fall back to the text between the first two
            # quote characters of either kind.
            rawValue = re.findall(r"[\"'].*?[\"']", string)[0]
            value = rawValue[1:-1]
        return cls(attrName, value, string)
class HTMLObject:
    """An HTML tag: its tag name plus the attributes found in it."""

    def __init__(self, aName):
        self.name = aName
        self.attributes = []  # contains HTMLAttribute

    def addAttribute(self, anAttribute):
        """Append an HTMLAttribute to this tag's attribute list."""
        self.attributes.append(anAttribute)

    def getAttributeWithName(self, aName):
        """Return the first attribute named aName (case-insensitive), or None."""
        aName = aName.lower()
        for anAttribute in self.attributes:
            if anAttribute.name.lower() == aName:
                return anAttribute
        return None

    @classmethod
    def withTagString(cls, string):
        """Build an HTMLObject from a string of the form <aTagName attrName="value" ... >

        Only the first ``<...>`` span of the string is parsed; any text after
        it is ignored. Only attributes with quoted values are recognised.
        Raises IndexError when the string contains no opening tag.
        """
        string = re.findall(r"<.*?>", string, re.S)[0]
        # Take the word right after "<"; this also handles "<input/>", where
        # the name is followed by neither whitespace nor ">".
        tagName = re.findall(r"(?<=<)\w+", string)[0]
        # The backreference makes the closing quote match the opening one.
        attrPattern = r"\w+=([\"']).*?\1"
        myObj = cls(tagName)
        for attrMatch in re.finditer(attrPattern, string):
            attrObj = HTMLAttribute.withAttributeString(attrMatch.group(0))
            myObj.addAttribute(attrObj)
        return myObj
class HTMLForm:
    """Pairs a form's name with the objects collected for that form."""

    def __init__(self, name, htmlObjects):
        # htmlObjects: list of HTMLObject
        self.HTMLObjects, self.name = htmlObjects, name
# ===== FUNCTIONS =====
def getFormsFromHTML(htmlData):
    """Return the text of every <form ...>...</form> element in htmlData.

    Matching is case-insensitive and spans newlines. Each result runs from
    the opening form tag to the first closing </form> after it.
    """
    # \b stops tags that merely start with "form" (e.g. <formula>) from being
    # taken as the start of a form.
    regex = re.compile(r"<form\b.*?>.*?</form>", re.IGNORECASE | re.S)
    return regex.findall(htmlData)
def getFormObjects(aForm):
    """Returns a list of HTMLObjects

    aForm is the HTML text of one form. Every opening tag whose name is
    listed in FORM_OBJECTS_TAG_NAME is parsed with HTMLObject.withTagString;
    the results are returned in document order. Matching is case-insensitive.
    """
    if not FORM_OBJECTS_TAG_NAME:
        return []
    # \b keeps a listed name from matching as a mere prefix (e.g. "<inputs>").
    tagPattern = re.compile(
        "<(?:" + "|".join(FORM_OBJECTS_TAG_NAME) + r")\b.*?>", re.S | re.I
    )
    return [HTMLObject.withTagString(aTag) for aTag in tagPattern.findall(aForm)]
def printForms(foundForms, foundObjects):
    """Print a readable summary of every form to standard output.

    Pass on a list of HTMLObject and a list of lists of HTMLObjects.
    The first list are the forms, the second are the objects contained by
    the forms at the corresponding index of the first list.

    For each form its attributes are printed, followed by the number of
    objects found in that form and a line for every object that has a
    'name' attribute.
    """
    for counter, aForm in enumerate(foundForms):
        formObjects = foundObjects[counter]
        print("===== FORM " + str(counter + 1) + " =====")
        print("\tATTRIBUTES:")
        for anAttribute in aForm.attributes:
            print("\t\t" + anAttribute.name + ": '" + anAttribute.value + "'")
        # Count the objects of this form, not the number of forms.
        print("\n\t" + str(len(formObjects)) + " OBJECTS:")
        for anObject in formObjects:
            nameAttribute = anObject.getAttributeWithName("name")
            if nameAttribute is not None:
                print("\t\t" + anObject.name + " (name=\"" + nameAttribute.value + "\")")
        print("\n")
def createXMLString(foundForms, foundObjects):
    """Return an XML document describing the forms and their named objects.

    Pass on a list of HTMLObject and a list of lists of HTMLObjects.
    The first list are the forms, the second are the objects contained by
    the forms at the corresponding index of the first list.

    XML:
        formlist
            form (mult)
                attribute (mult)
                    name
                    value
                object (mult)
                    type (eg: input)
                    name (eg: username)

    Objects without a 'name' attribute are left out. Text content is
    XML-escaped so characters such as '&' and '<' keep the output well-formed.
    """
    # Local import keeps this function usable without touching the file's
    # import block.
    from xml.sax.saxutils import escape

    parts = ["<formlist>\n"]
    for aForm, formObjects in zip(foundForms, foundObjects):
        parts.append("\t<form>\n")
        # add all attributes
        for anAttr in aForm.attributes:
            parts.append("\t\t<attribute>\n")
            parts.append("\t\t\t<name>" + escape(anAttr.name) + "</name>\n")
            parts.append("\t\t\t<value>" + escape(anAttr.value) + "</value>\n")
            parts.append("\t\t</attribute>\n")
        # add all input objects if they have a name
        for anObject in formObjects:
            nameAttr = anObject.getAttributeWithName("name")
            if nameAttr is not None:
                parts.append("\t\t<object>\n")
                parts.append("\t\t\t<type>" + escape(anObject.name) + "</type>\n")
                parts.append("\t\t\t<name>" + escape(nameAttr.value) + "</name>\n")
                parts.append("\t\t</object>\n")
        # close the form element
        parts.append("\t</form>\n\n")
    xmlString = "".join(parts)
    # Drop the last character (the blank line after the final form, or the
    # newline after <formlist> when there are no forms) before closing.
    return xmlString[:-1] + "</formlist>\n"
# ===== MAIN =====
# Parse the command line options
# Parse the command line options: an optional -x flag and an optional URL.
userArgv = sys.argv[1:]
flags, arguments = getopt.getopt(userArgv, "x")
wantsXMLFormat = ('-x', '') in flags
hasURL = len(arguments) > 0

# Get the HTML data, from the URL when one was given, otherwise from stdin.
myHTML = None
if hasURL:
    myURL = arguments[0]
    # urllib.urlopen signals failure by raising IOError rather than by
    # returning None, so the failure message has to come from the handler.
    try:
        urlHandle = urllib.urlopen(myURL)
    except IOError:
        print("Failed to open the URL")
        sys.exit(1)
    try:
        myHTML = urlHandle.read()
    finally:
        # Close the handle even when reading fails.
        urlHandle.close()
else:
    myHTML = sys.stdin.read()

# Get all forms
htmlForms = getFormsFromHTML(myHTML)

# Build two parallel lists: the form tag itself and the objects inside it.
foundForms = []
foundObjects = []  # list of list
for aFormTag in htmlForms:
    foundForms.append(HTMLObject.withTagString(aFormTag))
    foundObjects.append(getFormObjects(aFormTag))

# Print or create xml
if not wantsXMLFormat:
    printForms(foundForms, foundObjects)
else:
    myXMLString = createXMLString(foundForms, foundObjects)
    print(myXMLString)
#/usr/bin/python
导入系统
导入getopt
导入URL库
进口稀土
# ############################
#任何人都可以使用此代码。它可以在两种情况下免费使用
#和商业软件。它可以被复制、修改甚至修改
#被卖掉。此代码的创建者对以下内容不承担任何责任:
#此脚本可能造成的任何损坏。
# ############################
# ############################
# ############################
#用法:./exec[-x][URL]
#
#此应用程序记录HTML文档的所有形式,并且
#具有HTML“name”属性集的对象。节目
#当前仅当对象的属性为
#样式类似于XML格式(例如:name=“myname”)。
#
#选项:
#-x:创建以下形式的XML文档:
#==开始XML====
#表单列表
#形式(变量)
#属性(变量)
#名字
#价值观
#
#对象(变量)
#类型(例如:输入)
#名称(例如:用户名)
#==结束XML====
#
#URL:指向可用HTML文件的URL。如果不是
#指定程序将读取HTML文档
#从标准输入。
#
# ############################
#====数据=====
全局窗体\u对象\u标记\u名称
表单\对象\标签\名称=(“输入”,
“文本区域”,
“标签”,
“字段集”,
“传奇”,
“选择”,
“optgroup”,
“选择权”,
“按钮”)
#=======课程=====
类HTMLAttribute:
定义初始化(self、name、value或string=None):
self.name=名称
自我价值=价值
self.originalString=无
@类方法
def withAttributeString(cls,字符串):
“”采用attrNam=“value”形式的字符串
attrNameRegex=“\w+=”
attrName=re.findall(attrNameRegex,字符串)[0]
attrName=attrName[0:len(attrName)-1]
valueRegex=“[\”].*?[\”']
value=re.findall(valueRegex,string)[0]
值=值[1:len(值)-1]
返回cls(属性名、值、字符串)
类HTMLObject:
定义初始化(self,aName):
self.name=aName
self.attributes=[]包含HTMLAttribute
def addAttribute(自身、属性):
self.attributes.append(anaAttribute)
def getAttributeWithName(self,aName):
“”“返回none或HTLMAttribute”“”
aName=aName.lower()
对于self.attributes中的anAttribute:
如果anAttribute.name.lower()==aName:返回anAttribute
一无所获
@类方法
def withTagString(cls,字符串):
“”“采用以下格式的字符串”“”
tagOnyRegex=“”
regObj=re.compile(tagOnyRegex,re.S)
string=re.findall(regObj,string)[0]
tagNameRegex=“(?document.forms[1].elements.length
说有14个,但你只列出了11个。还有一些网站拒绝在没有cookie的情况下运行(一个跟踪和两个javascript测试)。查看Firebug网络流量。我确实忘记了13个中的一个(有一个收音机),但你看到的是Route.php
,这是结果解析器,它有一个11列的表。但无论如何,我检查了cookies,我认为你是对的。当我禁用cookies时,网站上说“Uw session is verlopen”(=“你的会话已过期”)。现在是否可以手工制作这样的cookie?或者是否有其他方法来解决此问题?没有查看您的代码。下载链接无效。最好在此处粘贴摘录。--cURL允许以某种方式设置cookie,请查看各种CURLOPT_cookie*
标志。我认为这是此类任务最常见的问题。
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>De Lijn Mobile - Berekeningen</title>
</head>
<body>
<?php
// Result page: reads the posted search form, builds the departure place,
// the arrival place and the date, then hands them to searchDeLijn().
require_once("./Plaats.php");
require_once("./Date.php");
require_once("./DeLijn.php");

echo "Gathering data...<br>";

// Departure place (vertrek).
$gemeente = $_POST["vertrekGemeente"];
$straat = $_POST["vertrekStraat"];
$nummer = $_POST["vertrekNummer"];
$vertrekPlaats = new Plaats($gemeente, $straat, $nummer);

// Arrival place (aankomst).
$gemeente = $_POST["aankomstGemeente"];
$straat = $_POST["aankomstStraat"];
$nummer = $_POST["aankomstNummer"];
$aankomstPlaats = new Plaats($gemeente, $straat, $nummer);

// Date and time, plus whether it is a departure or an arrival time.
$datumType = $_POST["datumType"];
$dag = $_POST["datumDag"];
$maand = $_POST["datumMaand"];
$jaar = $_POST["datumJaar"];
$uur = $_POST["datumUur"];
$min = $_POST["datumMinuten"];
$datum = Date::withDate($jaar, $maand, $dag, $uur, $min);
// NOTE(review): overwrites the month already passed to Date::withDate();
// presumably a workaround - confirm against Date.php.
$datum->month = $maand;

echo "Searching...<br>";
searchDeLijn($vertrekPlaats,
             $aankomstPlaats,
             $datumType,
             $datum);
?>
</body>
</html>
<?php
require_once("Route.php");
require_once("Date.php");
require_once("Plaats.php");
// Values compared against the $dateType argument of searchDeLijn().
define('DATE_ARRIVAL', "aankomst");
define('DATE_DEPARTURE', "vertrek");

/**
 * Posts a route query to the De Lijn route planner and parses the reply.
 *
 * The field array and the raw response are echoed to the page, as before.
 *
 * @param object $dep      Departure place; ->gemeente, ->straat and ->nummer are read.
 * @param object $ar       Arrival place; the same three properties are read.
 * @param string $dateType DATE_DEPARTURE plans by departure time; any other
 *                         value plans by arrival time.
 * @param object $date     ->day, ->month, ->year, ->hour and ->minutes are read.
 * @return array|null Result of extractRoutesFromXMLData(), or null when the
 *                    HTTP request fails.
 */
function searchDeLijn($dep, $ar, $dateType, $date) {
    $vertrekkenOfAankomen = "aankomen";
    if (DATE_DEPARTURE === $dateType) {
        $vertrekkenOfAankomen = "vertrekken";
    }

    // Round the minutes down to a multiple of five.
    $myMins = (int)$date->minutes;
    $myMins -= ($myMins % 5);

    $postFields = array(
        "form1:vertrekGemeenteInput" => $dep->gemeente,
        "form1:vertrekStraatInput" => $dep->straat,
        "form1:vertrekNrInput" => $dep->nummer,
        "form1:aankomstGemeenteInput" => $ar->gemeente,
        "form1:aankomstStraatInput" => $ar->straat,
        "form1:aankomstNrInput" => $ar->nummer,
        "form1:vertrekkenOfAankomenRadio" => $vertrekkenOfAankomen,
        // (string)(int) strips leading zeros ("05" becomes "5").
        "form1:dagCombo" => (string)(int)$date->day,
        "form1:maandCombo" => (string)(int)$date->month,
        "form1:jaarCombo" => $date->year,
        "form1:uurCombo" => (string)(int)$date->hour,
        "form1:minutenCombo" => (string)$myMins);
    print_r($postFields);

    // do the curl
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL,
        'http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=nl');
    curl_setopt($ch, CURLOPT_POST, 1);
    // An array here would be sent as multipart/form-data; a query string is
    // sent as application/x-www-form-urlencoded, like a normal form post.
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postFields));
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $contents = curl_exec($ch);
    curl_close($ch);

    // Strict comparison: only a failed transfer is an error, not an empty body.
    if ($contents === false) {
        return null;
    }
    echo $contents;

    $myRouteObjects = extractRoutesFromXMLData($contents);
    return $myRouteObjects;
}
/**
 * Collects the rows of the route planner result table.
 *
 * @param string $dataString HTML of the result page.
 * @return array|null One entry per table row, as returned by DOMNodeList::item()
 *                    (the rows are not converted to Route objects here), or
 *                    null when the table or its rows cannot be found.
 */
function extractRoutesFromXMLData($dataString) {
    // Bail out when a step fails, not when it succeeds.
    $tableBody = getResultsTableBody($dataString);
    if ($tableBody === null) { return null; }

    $tableRows = getTableRowsOfTableBody($tableBody);
    if ($tableRows === null) { return null; }

    // put them in an array
    $myArray = array();
    $count = $tableRows->length;
    for ($i = 0; $i < $count; $i++) {
        // Use the loop index so each row is taken once, not the first row $count times.
        $myArray[] = $tableRows->item($i);
    }
    return $myArray;
}
/**
 * Extracts the body of the "routeplanner_overzicht" table as an XML document.
 *
 * @param string $dataString HTML of the result page.
 * @return DOMDocument|null Document whose root is the table body, or null when
 *                          the table or its body is missing or cannot be parsed as XML.
 */
function getResultsTableBody($dataString) {
    // Get table element
    $status = preg_match('/<TABLE id="routeplanner_overzicht".*?>.*?<\/TABLE>/is',
        $dataString, $matches);
    if ($status !== 1) {
        return null;
    }
    $tableElement = $matches[0];

    // Extract body
    $status = preg_match('/<TBODY>.*?<\/TBODY>/is',
        $tableElement, $matches);
    if ($status !== 1) {
        return null;
    }

    $doc = new DOMDocument();
    // loadXML() returns false when the fragment is not well-formed XML.
    if ($doc->loadXML($matches[0]) === false) {
        return null;
    }
    return $doc;
}
/**
 * Returns the rows of the first table body in the given document.
 *
 * @param DOMDocument $xmlDoc Document that contains a table body element.
 * @return DOMNodeList|null The row elements, or null when the XPath
 *                          evaluation fails.
 */
function getTableRowsOfTableBody($xmlDoc) {
    $xpath = new DOMXPath($xmlDoc);
    // XPath positions start at 1, and XML element names are case-sensitive;
    // translate() lets both <TBODY>/<TR> and <tbody>/<tr> match.
    $xpathres = $xpath->evaluate(
        "(//*[translate(local-name(), 'TBODY', 'tbody') = 'tbody'])[1]"
        . "/*[translate(local-name(), 'TR', 'tr') = 'tr']");
    if ($xpathres === false) {
        return null;
    }
    return $xpathres;
}
?>
#!/usr/bin/python
import sys
import getopt
import urllib
import re
# ############################
# This code may be used by anyone. It may be used in both free
# and commercial software. It may be copied, modified and even
# be sold. The creator of this code takes no responsibility for
# any damage this script could do.
# ############################
# ############################
# ############################
# Usage: ./exec [-x] [URL]
#
# This application logs all forms of an HTML document and it's
# objects which have the HTML 'name'-attribute set. The program
# currently only works when the attributes of the objects are
# styled like the XML format (eg: name="myname").
#
# Options:
# -x: Create an XML document of the following form:
# ==== BEGIN XML ====
# formlist
# form (variable)
# attribute (variable)
# name
# value
#
# object (variable)
# type (eg: input)
# name (eg: username)
# ==== END XML ====
#
# URL: a URL pointing to an available, HTML file. If it's not
# specified specified the program will read the HTML document
# from the standard input.
#
# ############################
# ===== DATA =====
# Tag names of the HTML elements that are reported as objects of a form.
FORM_OBJECTS_TAG_NAME = (
    "input",
    "textarea",
    "label",
    "fieldset",
    "legend",
    "select",
    "optgroup",
    "option",
    "button",
)
# ===== CLASSES =====
class HTMLAttribute:
    """A single attribute of an HTML tag.

    Attributes:
        name: attribute name as written in the tag.
        value: attribute value without its surrounding quotes.
        originalString: the raw text the attribute was built from, or None
            when the attribute was constructed directly.
    """

    def __init__(self, name, value, orString=None):
        self.name = name
        self.value = value
        # Keep the raw source text handed in by the caller.
        self.originalString = orString

    @classmethod
    def withAttributeString(cls, string):
        """Build an attribute from a string of the form attrName="value".

        The value may be wrapped in single or double quotes. Raises
        IndexError when the string has no ``name=`` part or no quoted value.
        """
        attrName = re.findall(r"(\w+)=", string)[0]
        # Prefer a value whose closing quote is the same character as its
        # opening quote, so that "it's" is not cut short at the apostrophe.
        quoted = re.findall(r"([\"'])(.*?)\1", string)
        if quoted:
            value = quoted[0][1]
        else:
            # Mismatched quotes: fall back to the text between the first two
            # quote characters of either kind.
            rawValue = re.findall(r"[\"'].*?[\"']", string)[0]
            value = rawValue[1:-1]
        return cls(attrName, value, string)
class HTMLObject:
    """An HTML tag: its tag name plus the attributes found in it."""

    def __init__(self, aName):
        self.name = aName
        self.attributes = []  # contains HTMLAttribute

    def addAttribute(self, anAttribute):
        """Append an HTMLAttribute to this tag's attribute list."""
        self.attributes.append(anAttribute)

    def getAttributeWithName(self, aName):
        """Return the first attribute named aName (case-insensitive), or None."""
        aName = aName.lower()
        for anAttribute in self.attributes:
            if anAttribute.name.lower() == aName:
                return anAttribute
        return None

    @classmethod
    def withTagString(cls, string):
        """Build an HTMLObject from a string of the form <aTagName attrName="value" ... >

        Only the first ``<...>`` span of the string is parsed; any text after
        it is ignored. Only attributes with quoted values are recognised.
        Raises IndexError when the string contains no opening tag.
        """
        string = re.findall(r"<.*?>", string, re.S)[0]
        # Take the word right after "<"; this also handles "<input/>", where
        # the name is followed by neither whitespace nor ">".
        tagName = re.findall(r"(?<=<)\w+", string)[0]
        # The backreference makes the closing quote match the opening one.
        attrPattern = r"\w+=([\"']).*?\1"
        myObj = cls(tagName)
        for attrMatch in re.finditer(attrPattern, string):
            attrObj = HTMLAttribute.withAttributeString(attrMatch.group(0))
            myObj.addAttribute(attrObj)
        return myObj
class HTMLForm:
    """Pairs a form's name with the objects collected for that form."""

    def __init__(self, name, htmlObjects):
        # htmlObjects: list of HTMLObject
        self.HTMLObjects, self.name = htmlObjects, name
# ===== FUNCTIONS =====
def getFormsFromHTML(htmlData):
    """Return the text of every <form ...>...</form> element in htmlData.

    Matching is case-insensitive and spans newlines. Each result runs from
    the opening form tag to the first closing </form> after it.
    """
    # \b stops tags that merely start with "form" (e.g. <formula>) from being
    # taken as the start of a form.
    regex = re.compile(r"<form\b.*?>.*?</form>", re.IGNORECASE | re.S)
    return regex.findall(htmlData)
def getFormObjects(aForm):
    """Returns a list of HTMLObjects

    aForm is the HTML text of one form. Every opening tag whose name is
    listed in FORM_OBJECTS_TAG_NAME is parsed with HTMLObject.withTagString;
    the results are returned in document order. Matching is case-insensitive.
    """
    if not FORM_OBJECTS_TAG_NAME:
        return []
    # \b keeps a listed name from matching as a mere prefix (e.g. "<inputs>").
    tagPattern = re.compile(
        "<(?:" + "|".join(FORM_OBJECTS_TAG_NAME) + r")\b.*?>", re.S | re.I
    )
    return [HTMLObject.withTagString(aTag) for aTag in tagPattern.findall(aForm)]
def printForms(foundForms, foundObjects):
    """Print a readable summary of every form to standard output.

    Pass on a list of HTMLObject and a list of lists of HTMLObjects.
    The first list are the forms, the second are the objects contained by
    the forms at the corresponding index of the first list.

    For each form its attributes are printed, followed by the number of
    objects found in that form and a line for every object that has a
    'name' attribute.
    """
    for counter, aForm in enumerate(foundForms):
        formObjects = foundObjects[counter]
        print("===== FORM " + str(counter + 1) + " =====")
        print("\tATTRIBUTES:")
        for anAttribute in aForm.attributes:
            print("\t\t" + anAttribute.name + ": '" + anAttribute.value + "'")
        # Count the objects of this form, not the number of forms.
        print("\n\t" + str(len(formObjects)) + " OBJECTS:")
        for anObject in formObjects:
            nameAttribute = anObject.getAttributeWithName("name")
            if nameAttribute is not None:
                print("\t\t" + anObject.name + " (name=\"" + nameAttribute.value + "\")")
        print("\n")
def createXMLString(foundForms, foundObjects):
    """Return an XML document describing the forms and their named objects.

    Pass on a list of HTMLObject and a list of lists of HTMLObjects.
    The first list are the forms, the second are the objects contained by
    the forms at the corresponding index of the first list.

    XML:
        formlist
            form (mult)
                attribute (mult)
                    name
                    value
                object (mult)
                    type (eg: input)
                    name (eg: username)

    Objects without a 'name' attribute are left out. Text content is
    XML-escaped so characters such as '&' and '<' keep the output well-formed.
    """
    # Local import keeps this function usable without touching the file's
    # import block.
    from xml.sax.saxutils import escape

    parts = ["<formlist>\n"]
    for aForm, formObjects in zip(foundForms, foundObjects):
        parts.append("\t<form>\n")
        # add all attributes
        for anAttr in aForm.attributes:
            parts.append("\t\t<attribute>\n")
            parts.append("\t\t\t<name>" + escape(anAttr.name) + "</name>\n")
            parts.append("\t\t\t<value>" + escape(anAttr.value) + "</value>\n")
            parts.append("\t\t</attribute>\n")
        # add all input objects if they have a name
        for anObject in formObjects:
            nameAttr = anObject.getAttributeWithName("name")
            if nameAttr is not None:
                parts.append("\t\t<object>\n")
                parts.append("\t\t\t<type>" + escape(anObject.name) + "</type>\n")
                parts.append("\t\t\t<name>" + escape(nameAttr.value) + "</name>\n")
                parts.append("\t\t</object>\n")
        # close the form element
        parts.append("\t</form>\n\n")
    xmlString = "".join(parts)
    # Drop the last character (the blank line after the final form, or the
    # newline after <formlist> when there are no forms) before closing.
    return xmlString[:-1] + "</formlist>\n"
# ===== MAIN =====
# Parse the command line options
# Parse the command line options: an optional -x flag and an optional URL.
userArgv = sys.argv[1:]
flags, arguments = getopt.getopt(userArgv, "x")
wantsXMLFormat = ('-x', '') in flags
hasURL = len(arguments) > 0

# Get the HTML data, from the URL when one was given, otherwise from stdin.
myHTML = None
if hasURL:
    myURL = arguments[0]
    # urllib.urlopen signals failure by raising IOError rather than by
    # returning None, so the failure message has to come from the handler.
    try:
        urlHandle = urllib.urlopen(myURL)
    except IOError:
        print("Failed to open the URL")
        sys.exit(1)
    try:
        myHTML = urlHandle.read()
    finally:
        # Close the handle even when reading fails.
        urlHandle.close()
else:
    myHTML = sys.stdin.read()

# Get all forms
htmlForms = getFormsFromHTML(myHTML)

# Build two parallel lists: the form tag itself and the objects inside it.
foundForms = []
foundObjects = []  # list of list
for aFormTag in htmlForms:
    foundForms.append(HTMLObject.withTagString(aFormTag))
    foundObjects.append(getFormObjects(aFormTag))

# Print or create xml
if not wantsXMLFormat:
    printForms(foundForms, foundObjects)
else:
    myXMLString = createXMLString(foundForms, foundObjects)
    print(myXMLString)