The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.
require 'rubygems'
require 'mechanize'
require 'logger'
# Create an agent that logs protocol details to mech.log.
agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
# Present a browser-like User-Agent string instead of the default.
agent.user_agent_alias = 'Mac Safari'
page = agent.get("http://www.google.com/")
# Locate Google's search form (named "f") and fill in the query field.
search_form = page.forms.name("f").first
search_form.fields.name("q").value = "Hello"
# Submit the form and print the HTML of the results page.
search_results = agent.submit(search_form)
puts search_results.body
| VERSION | = | '0.7.6' | The version of Mechanize you are using. | |
| AGENT_ALIASES | = | { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" } | User Agent aliases | |
| CNONCE | = | Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535))) |
| redirect_ok | -> | follow_redirect? |
| ca_file | [RW] | |
| cert | [RW] | |
| conditional_requests | [RW] | |
| cookie_jar | [RW] | |
| follow_meta_refresh | [RW] | |
| history | [R] | |
| history_added | [RW] | |
| keep_alive | [RW] | |
| keep_alive_time | [RW] | |
| key | [RW] | |
| log | [RW] | |
| open_timeout | [RW] | |
| pass | [RW] | |
| pluggable_parser | [R] | |
| read_timeout | [RW] | |
| redirect_ok | [RW] | |
| scheme_handlers | [RW] | |
| user_agent | [RW] | |
| verify_callback | [RW] | |
| watch_for_set | [RW] |
# File lib/www/mechanize.rb, line 310
# Decodes HTML entities in +s+: named entities ("&amp;amp;") are looked up
# in Hpricot::NamedCharacters, numeric entities ("&amp;#65;") are converted
# from their decimal codepoint.  Unknown or unencodable entities are left
# untouched.  Returns +s+ as-is when it is nil/false.
def html_unescape(s)
  return s unless s
  s.gsub(/&(\w+|#[0-9]+);/) { |match|
    codepoint =
      case match
      when /&(\w+);/
        Hpricot::NamedCharacters[$1]
      when /&#([0-9]+);/
        $1.to_i
      end
    if codepoint
      begin
        [codepoint].pack('U')
      rescue
        match   # codepoint could not be encoded as UTF-8
      end
    else
      match     # entity not recognised
    end
  }
end
# File lib/www/mechanize.rb, line 85
# Creates a new Mechanize agent with all settings at their defaults.
# Pass a block to configure the agent before first use:
#
#   agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
def initialize
  # Basic agent state (attr_accessors).
  @cookie_jar    = CookieJar.new
  @log           = nil
  @open_timeout  = nil
  @read_timeout  = nil
  @user_agent    = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @history_added = nil

  # SSL/TLS configuration.
  @ca_file         = nil # OpenSSL server certificate file
  # callback for OpenSSL errors while verifying the server certificate
  # chain, can be used for debugging or to ignore errors by always
  # returning _true_
  @verify_callback = nil
  @cert            = nil # OpenSSL Certificate
  @key             = nil # OpenSSL Private Key
  @pass            = nil # OpenSSL Password

  @redirect_ok = true    # Should we follow redirects?

  # attr_readers
  @history          = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Auth variables
  @user            = nil # Auth User
  @password        = nil # Auth Password
  @digest          = nil # DigestAuth Digest
  @auth_hash       = {}  # Keep track of urls for sending auth
  @digest_response = nil

  # Proxy settings (configured via #set_proxy).
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true   # send If-Modified-Since for visited pages
  @follow_meta_refresh  = false

  # Connection Cache & Keep alive
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # Unknown schemes raise; http/https/relative pass the link through.
  @scheme_handlers = Hash.new { |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise UnsupportedSchemeError.new(scheme)
    }
  }
  @scheme_handlers['http']     = lambda { |link, page| link }
  @scheme_handlers['https']    = @scheme_handlers['http']
  @scheme_handlers['relative'] = @scheme_handlers['http']

  yield self if block_given?
end
# File lib/www/mechanize.rb, line 683
# Builds an application/x-www-form-urlencoded query string from
# +parameters+ (a Hash or an array of [key, value] pairs).  Pairs whose
# key is nil are skipped.  Uses CGI.escape rather than
# WEBrick::HTTPUtils.escape_form because WEBrick is no longer bundled
# with the Ruby standard library (removed in Ruby 3.0); both encode a
# space as '+'.
def self.build_query_string(parameters)
  parameters.map { |k, v|
    k &&
      [CGI.escape(k.to_s), CGI.escape(v.to_s)].join("=")
  }.compact.join('&')
end
# File lib/www/mechanize.rb, line 167
# Stores the credentials used for HTTP authentication (read back as
# @user/@password by the basic and digest auth paths).  They remain in
# effect until replaced by a later call.
def auth(user, password)
  @user = user
  @password = password
end
Sets the user and password to be used for basic authentication.
# File lib/www/mechanize.rb, line 163
# Sets the user and password to be used for basic authentication.
# Simply delegates to #auth, which stores the credentials.
def basic_auth(user, password)
  auth(user, password)
end
Clicks the WWW::Mechanize::Link object passed in and returns the page fetched.
# File lib/www/mechanize.rb, line 213
# Clicks the WWW::Mechanize::Link object passed in and returns the page
# fetched.  The link's owning page (when it responds to #page) is used
# as the Referer; otherwise the current page is.
def click(link)
  referer =
    begin
      link.page
    rescue
      nil   # link has no owning page; fall back to the current page
    end
  # Prefer explicit href/src attributes; fall back to Link#href.
  target = link.attributes['href'] || link.attributes['src'] || link.href
  uri = to_absolute_uri(target, referer || current_page())
  get(uri, referer)
end
Fetches the URL passed in and returns a page.
# File lib/www/mechanize.rb, line 173
# Fetches +url+ and returns the resulting page.  +parameters+ (an array
# of pairs or a Hash) are appended to the URL's query string; +referer+
# overrides the Referer page and defaults to the current page.  Yields
# the page when a block is given, and records it in the history.
def get(url, parameters = [], referer = nil)
  unless parameters.respond_to?(:each) # FIXME: Remove this in 0.8.0
    # Legacy two-argument form get(url, referer): shift the arguments.
    referer = parameters
    parameters = []
  end

  referer ||= current_page || Page.new(nil, {'content-type'=>'text/html'})

  # FIXME: Huge hack so that using a URI as a referer works.  I need to
  # refactor everything to pass around URIs but still support
  # WWW::Mechanize::Page#base
  unless referer.is_a?(WWW::Mechanize::File)
    referer = referer.is_a?(String) ?
      Page.new(URI.parse(referer), {'content-type' => 'text/html'}) :
      Page.new(referer, {'content-type' => 'text/html'})
  end
  abs_uri = to_absolute_uri(url, referer)

  # Append the extra parameters to any query already on the URL.
  if parameters.length > 0
    abs_uri.query ||= ''
    abs_uri.query << '&' if abs_uri.query.length > 0
    abs_uri.query << self.class.build_query_string(parameters)
  end

  # fetch the page
  request = fetch_request(abs_uri)
  page = fetch_page(abs_uri, request, referer)
  add_to_history(page)
  yield page if block_given?
  page
end
Fetch a file and return the contents of the file.
# File lib/www/mechanize.rb, line 206
# Fetches +url+ and returns the raw response body as a String.
def get_file(url)
  fetched_page = get(url)
  fetched_page.body
end
# File lib/www/mechanize.rb, line 143
# Caps the page history at +length+ entries by forwarding to the
# history object's max_size attribute.
def max_history=(length)
  @history.max_size = length
end
Posts to the given URL with the query parameters passed in. Query parameters can be passed as a hash, or as an array of arrays. Example:
agent.post('http://example.com/', "foo" => "bar")
or
agent.post('http://example.com/', [ ["foo", "bar"] ])
# File lib/www/mechanize.rb, line 239
# Posts +query+ (a Hash or array of pairs) to +url+.  IO values are
# sent as file uploads, switching the form to multipart/form-data;
# everything else becomes a plain form field.
def post(url, query = {})
  node = Hpricot::Elem.new(Hpricot::STag.new('form'))
  node['method'] = 'POST'
  node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(node)
  query.each do |key, value|
    if value.is_a?(IO)
      # File handle: attach its contents as an upload.
      form.enctype = 'multipart/form-data'
      upload = Form::FileUpload.new(key.to_s, ::File.basename(value.path))
      upload.file_data = value.read
      form.file_uploads << upload
    else
      form.fields << Form::Field.new(key.to_s, value)
    end
  end
  post_form(url, form)
end
Sets the proxy address, port, user, and password
# File lib/www/mechanize.rb, line 147
# Sets the proxy address, port, and optional user/password used for
# all subsequent connections.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com')
agent.submit(page.forms.first)
With a button
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/www/mechanize.rb, line 264
# Submits +form+, optionally including +button+ in the query, and
# returns the fetched page.  POST forms go through post_form; GET forms
# have their query serialized onto the action URI.  Any other form
# method raises.
def submit(form, button = nil)
  form.add_button_to_query(button) if button
  target = to_absolute_uri(form.action, form.page)
  verb = form.method.upcase
  case verb
  when 'POST'
    post_form(target, form)
  when 'GET'
    target.query = WWW::Mechanize.build_query_string(form.build_query)
    get(target)
  else
    raise "unsupported method: #{verb}"
  end
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/www/mechanize.rb, line 298
# Runs the given block (passing self), then restores the page history
# to what it was beforehand — even if the block raises.  Returns the
# block's value.
def transact
  saved_history = @history.dup
  begin
    yield self
  ensure
    @history = saved_history
  end
end
Returns whether or not a url has been visited
# File lib/www/mechanize.rb, line 284
# Returns true when +url+ resolves to a page already recorded in the
# history, false otherwise.
def visited?(url)
  visited_page(url) ? true : false
end
# File lib/www/mechanize.rb, line 376
# Builds the value of a Digest "Authorization" header (RFC 2617) for
# +request+ against +uri+, from the server challenge +auth_header+
# ("Digest realm=..., nonce=..., qop=...").  Credentials come from
# @user/@password as stored by #auth.  Set +is_IIS+ to quote the qop
# value, which IIS requires.  Increments the shared @@nonce_count.
#
# Fix: removed the dead locals `user = @digest_user` and
# `password = @digest_password` — they were assigned but never read
# (the digest below uses @user/@password).
def gen_auth_header(uri, request, auth_header, is_IIS = false)
  @@nonce_count += 1

  # Split the challenge into scheme ($1) and parameter list ($2), then
  # collect the key="value" pairs.
  auth_header =~ /^(\w+) (.*)/
  params = {}
  $2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 }

  # H(A1) = MD5(user:realm:password), H(A2) = MD5(method:uri) per RFC 2617.
  a_1 = "#{@user}:#{params['realm']}:#{@password}"
  a_2 = "#{request.method}:#{uri.path}"
  request_digest = ''
  request_digest << Digest::MD5.hexdigest(a_1)
  request_digest << ':' << params['nonce']
  request_digest << ':' << ('%08x' % @@nonce_count)
  request_digest << ':' << CNONCE
  request_digest << ':' << params['qop']
  request_digest << ':' << Digest::MD5.hexdigest(a_2)

  header = ''
  header << "Digest username=\"#{@user}\", "
  header << "realm=\"#{params['realm']}\", "
  # IIS expects the qop value quoted; other servers take it bare.
  if is_IIS then
    header << "qop=\"#{params['qop']}\", "
  else
    header << "qop=#{params['qop']}, "
  end
  header << "uri=\"#{uri.path}\", "
  header << "algorithm=MD5, "
  header << "nonce=\"#{params['nonce']}\", "
  header << "nc=#{'%08x' % @@nonce_count}, "
  header << "cnonce=\"#{CNONCE}\", "
  header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""

  return header
end
# File lib/www/mechanize.rb, line 326
# Decorates +request+ with the standard Mechanize headers — connection
# keep-alive, Accept-*, cookies for +uri+, Referer from +cur_page+,
# User-Agent, If-Modified-Since for previously visited pages, and any
# authentication already negotiated for the host — then returns it.
def set_headers(uri, request, cur_page)
  if @keep_alive
    request.add_field('Connection', 'keep-alive')
    request.add_field('Keep-Alive', keep_alive_time.to_s)
  else
    request.add_field('Connection', 'close')
  end
  request.add_field('Accept-Encoding', 'gzip,identity')
  request.add_field('Accept-Language', 'en-us,en;q=0.5')
  request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')

  # Send any cookies the jar holds for this URI.
  unless @cookie_jar.empty?(uri)
    cookies = @cookie_jar.cookies(uri)
    cookie = cookies.length > 0 ? cookies.join("; ") : nil
    if log
      cookies.each do |c|
        log.debug("using cookie: #{c}")
      end
    end
    request.add_field('Cookie', cookie)
  end

  # Add Referer header to request
  unless cur_page.uri.nil?
    request.add_field('Referer', cur_page.uri.to_s)
  end

  # Add User-Agent header to request
  request.add_field('User-Agent', @user_agent) if @user_agent

  # Add If-Modified-Since if page is in history
  if @conditional_requests
    if( (page = visited_page(uri)) && page.response['Last-Modified'] )
      request.add_field('If-Modified-Since', page.response['Last-Modified'])
    end
  end

  # Replay credentials for hosts that have already challenged us;
  # the scheme was recorded in @auth_hash by fetch_page's 401 handling.
  if( @auth_hash[uri.host] )
    case @auth_hash[uri.host]
    when :basic
      request.basic_auth(@user, @password)
    when :digest
      # @digest holds the stored challenge; the computed header is
      # cached in @digest_response between requests.
      @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
      request.add_field('Authorization', @digest_response) if @digest_response
    end
  end

  request
end
# File lib/www/mechanize.rb, line 691
# Records +page+ in the history, keyed by its absolute URI, and fires
# the history_added callback when one is registered.
def add_to_history(page)
  @history.push(page, to_absolute_uri(page.uri))
  callback = history_added
  callback.call(page) if callback
end
uri is an absolute URI
# File lib/www/mechanize.rb, line 482
# uri is an absolute URI.  Performs the full HTTP round trip for
# +request+: reuses (or opens) a cached connection to uri's host:port,
# applies SSL settings, sends the request, decompresses and parses the
# body via the pluggable parsers, saves cookies and keep-alive options,
# and handles meta-refresh, redirects, 304s, and 401 auth retries.
# Returns the resulting page object.
def fetch_page(uri, request, cur_page=current_page(), request_data=[])
  raise "unsupported scheme: #{uri.scheme}" unless ['http', 'https'].include?(uri.scheme.downcase)

  log.info("#{ request.class }: #{ request.path }") if log

  page = nil

  # One cached connection (plus its advertised keep-alive options) per
  # host:port pair.
  cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
    :connection => nil,
    :keep_alive_options => {},
  })
  http_obj = cache_obj[:connection]
  if http_obj.nil? || ! http_obj.started?
    http_obj = cache_obj[:connection] =
      Net::HTTP.new( uri.host,
                    uri.port,
                    @proxy_addr,
                    @proxy_port,
                    @proxy_user,
                    @proxy_pass
                   )
    cache_obj[:keep_alive_options] = {}

    # Specify timeouts if given
    http_obj.open_timeout = @open_timeout if @open_timeout
    http_obj.read_timeout = @read_timeout if @read_timeout
  end

  # SSL must be configured before the connection is started.  Peer
  # verification is only switched on when a CA file was supplied.
  if uri.scheme == 'https' && ! http_obj.started?
    http_obj.use_ssl = true
    http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
    if @ca_file
      http_obj.ca_file = @ca_file
      http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
      http_obj.verify_callback = @verify_callback if @verify_callback
    end
    if @cert && @key
      http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
      http_obj.key = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
    end
  end

  # If we're keeping connections alive and the last request time is too
  # long ago, stop the connection.  Or, if the max requests left is 1,
  # reset the connection.
  if @keep_alive && http_obj.started?
    opts = cache_obj[:keep_alive_options]
    if((opts[:timeout] &&
        Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
       opts[:max] && opts[:max].to_i == 1)

      log.debug('Finishing stale connection') if log
      http_obj.finish

    end
  end

  http_obj.start unless http_obj.started?

  request = set_headers(uri, request, cur_page)

  # Log specified headers for the request
  if log
    request.each_header do |k, v|
      log.debug("request-header: #{ k } => #{ v }")
    end
  end

  cache_obj[:last_request_time] = Time.now.to_i

  # Send the request
  begin
    response = http_obj.request(request, *request_data) {|response|

      # Stream the body into memory, tracking how many bytes arrived.
      body = StringIO.new
      total = 0
      response.read_body { |part|
        total += part.length
        body.write(part)
        log.debug("Read #{total} bytes") if log
      }
      # Net::HTTP ignores EOFError if Content-length is given, so we emulate it here.
      raise EOFError if response.content_length() && response.content_length() != total
      body.rewind

      response.each_header { |k,v|
        log.debug("response-header: #{ k } => #{ v }")
      } if log

      # Media type with any ";charset=..." parameter stripped.
      content_type = nil
      unless response['Content-Type'].nil?
        data = response['Content-Type'].match(/^([^;]*)/)
        content_type = data[1].downcase unless data.nil?
      end

      # Decompress the body when the server compressed it.
      response_body =
        if encoding = response['Content-Encoding']
          case encoding.downcase
          when 'gzip'
            log.debug('gunzip body') if log
            if response['Content-Length'].to_i > 0 || body.length > 0
              begin
                Zlib::GzipReader.new(body).read
              rescue Zlib::BufError => e
                # Some servers send a truncated gzip stream: skip the
                # 10-byte gzip header and inflate the raw deflate data.
                log.error('Caught a Zlib::BufError') if log
                body.rewind
                body.read(10)
                Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body.read)
              end
            else
              ''
            end
          when 'x-gzip'
            # NOTE(review): x-gzip bodies are passed through without
            # decompression here — confirm that is intentional.
            body.read
          else
            raise 'Unsupported content encoding'
          end
        else
          body.read
        end

      # Find our pluggable parser
      page = @pluggable_parser.parser(content_type).new(
        uri,
        response,
        response_body,
        response.code
      ) { |parser|
        parser.mech = self if parser.respond_to? :mech=
        if parser.respond_to?(:watch_for_set=) && @watch_for_set
          parser.watch_for_set = @watch_for_set
        end
      }

    }
  rescue EOFError, Errno::ECONNRESET, Errno::EPIPE
    # The cached connection died mid-request: reconnect and retry after
    # clearing the request body.
    log.error("Rescuing EOF error") if log
    http_obj.finish
    request.body = nil
    http_obj.start
    retry
  end

  # If the server sends back keep alive options, save them
  if keep_alive_info = response['keep-alive']
    keep_alive_info.split(/,\s*/).each do |option|
      k, v = option.split(/=/)
      cache_obj[:keep_alive_options] ||= {}
      cache_obj[:keep_alive_options][k.intern] = v
    end
  end

  # Save any cookies the response set.
  (response.get_fields('Set-Cookie')||[]).each do |cookie|
    Cookie::parse(uri, cookie, log) { |c|
      log.debug("saved cookie: #{c}") if log
      @cookie_jar.add(uri, c)
    }
  end

  log.info("status: #{ page.code }") if log

  res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]

  # A <meta http-equiv="refresh"> takes priority over the HTTP status
  # when follow_meta_refresh is enabled.
  if follow_meta_refresh && page.respond_to?(:meta) &&
      (redirect = page.meta.first)
    return redirect.click
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri)
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri = page.uri
    abs_uri = to_absolute_uri(response['Location'].to_s, page)
    page = fetch_page(abs_uri, fetch_request(abs_uri), page)
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    # 401: record the negotiated scheme for this host and replay the
    # request once with credentials (set_headers adds them next time).
    raise ResponseCodeError.new(page) unless @user || @password
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    # Copy the request headers for the second attempt
    req = fetch_request(uri, request.method.downcase.to_sym)
    request.each_header do |k,v|
      req[k] = v
    end
    return fetch_page(uri, req, cur_page, request_data)
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
Creates a new request object based on the scheme and type
# File lib/www/mechanize.rb, line 472
# Creates a new request object based on the scheme and type.  +type+
# selects the verb: :get (the default) builds a Net::HTTP::Get,
# anything else a Net::HTTP::Post.  Raises for schemes other than
# http/https.
def fetch_request(uri, type = :get)
  raise "unsupported scheme: #{uri.scheme}" unless ['http', 'https'].include?(uri.scheme.downcase)
  request_class = (type == :get) ? Net::HTTP::Get : Net::HTTP::Post
  request_class.new(uri.request_uri)
end
# File lib/www/mechanize.rb, line 452
# POSTs +form+ to +url+ (resolved against the form's page or the
# current page), records the result in the history, and returns the
# fetched page.
def post_form(url, form)
  origin = form.page || current_page ||
           Page.new( nil, {'content-type'=>'text/html'})

  payload = form.request_data
  target = to_absolute_uri(url, origin)

  request = fetch_request(target, :post)
  request.add_field('Content-Type', form.enctype)
  request.add_field('Content-Length', payload.size.to_s)

  log.debug("query: #{ payload.inspect }") if log

  # fetch the page
  page = fetch_page(target, request, origin, [payload])
  add_to_history(page)
  page
end
# File lib/www/mechanize.rb, line 417
# Resolves +url+ (String or URI) against +cur_page+ and returns an
# absolute URI.  String input is percent-escaped and HTML-unescaped
# before parsing; relative URLs are resolved against the page's <base>
# tag when it carries an absolute URI, otherwise against the page's
# own URI.
def to_absolute_uri(url, cur_page=current_page())
  unless url.is_a? URI
    # Percent-escape raw non-ASCII bytes.
    # NOTE(review): $KCODE is a Ruby 1.8-ism; on 1.9+ this branch would
    # need String#encoding instead — confirm the supported Ruby range.
    url = url.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/) { |match|
      sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
    }

    # Escape each segment between existing %XX escapes and '#' so that
    # already-escaped sequences are not double-escaped, then undo any
    # HTML entities before parsing.
    url = URI.parse(
      Mechanize.html_unescape(
        url.split(/%[0-9A-Fa-f]{2}|#/).zip(
          url.scan(/%[0-9A-Fa-f]{2}|#/)
        ).map { |x,y|
          "#{URI.escape(x)}#{y}"
        }.join('')
      )
    )
  end

  # Unknown schemes raise UnsupportedSchemeError via the default handler.
  url = @scheme_handlers[url.relative? ? 'relative' : url.scheme.downcase].call(url, cur_page)
  url.path = '/' if url.path.length == 0

  # construct an absolute uri
  if url.relative?
    raise 'no history. please specify an absolute URL' unless cur_page.uri
    base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
    url = ((base && base.uri && base.uri.absolute?) ?
      base.uri :
      cur_page.uri) + url
    # NOTE(review): this second merge looks redundant — URI#+ with an
    # absolute right-hand side returns that absolute URI unchanged, so
    # +url+ is already final here.  Confirm before removing.
    url = cur_page.uri + url
    # Strip initial "/.." bits from the path
    url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  return url
end