The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.
require 'rubygems'
require 'mechanize'
require 'logger'
# Create an agent that writes its log to mech.log and uses the
# 'Mac Safari' user agent alias.
agent = Mechanize.new do |mech|
  mech.log = Logger.new("mech.log")
end
agent.user_agent_alias = 'Mac Safari'

# Fetch the page and look up the form named "f".
home_page = agent.get("http://www.google.com/")
query_form = home_page.form_with(:name => "f")

# Fill in the "q" field, submit the form and print the body of the result.
query_field = query_form.field_with(:name => "q")
query_field.value = "Hello"
results_page = agent.submit(query_form)
puts results_page.body
| VERSION | = | '1.0.0' | The version of Mechanize you are using. | |
| AGENT_ALIASES | = | { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Firefox' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" } | User Agent aliases |
| redirect_ok | -> | follow_redirect? |
| ca_file | [RW] | |
| cert | [RW] | |
| conditional_requests | [RW] | |
| cookie_jar | [RW] | |
| follow_meta_refresh | [RW] | |
| gzip_enabled | [RW] | |
| history | [R] | |
| history_added | [RW] | |
| html_parser | [RW] | The HTML parser to be used when parsing documents |
| html_parser | [RW] | |
| keep_alive | [RW] | |
| keep_alive_time | [RW] | |
| key | [RW] | |
| log | [RW] | |
| open_timeout | [RW] | |
| pass | [RW] | |
| pluggable_parser | [R] | |
| proxy_addr | [R] | Proxy settings |
| proxy_pass | [R] | |
| proxy_port | [R] | |
| proxy_user | [R] | |
| read_timeout | [RW] | |
| redirect_ok | [RW] | |
| redirection_limit | [RW] | |
| request_headers | [RW] | A hash of custom request headers |
| scheme_handlers | [RW] | |
| user_agent | [RW] | |
| verify_callback | [RW] | |
| watch_for_set | [RW] |
# File lib/mechanize.rb, line 109
# Copies this object's html_parser and log onto +child+ for whichever of
# the two +child+ does not already have set, then calls super.
# NOTE(review): presumably the Class#inherited hook, with +child+ being the
# new subclass -- the enclosing scope is not visible here; confirm.
def inherited(child)
  child.html_parser = html_parser unless child.html_parser
  child.log = log unless child.log
  super
end
# File lib/mechanize.rb, line 116
# Sets every piece of agent state to its default value and then yields the
# new agent to the block, when one is given, so the caller can adjust it.
def initialize
  # attr_accessors
  @cookie_jar    = CookieJar.new
  @log           = nil
  @open_timeout  = nil
  @read_timeout  = nil
  @user_agent    = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @history_added = nil

  # OpenSSL server certificate file
  @ca_file = nil

  # callback for OpenSSL errors while verifying the server certificate
  # chain, can be used for debugging or to ignore errors by always
  # returning _true_
  @verify_callback = nil

  @cert = nil # OpenSSL Certificate
  @key  = nil # OpenSSL Private Key
  @pass = nil # OpenSSL Password

  @redirect_ok  = true # Should we follow redirects?
  @gzip_enabled = true

  # attr_readers
  @history          = Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Auth variables
  @user            = nil # Auth User
  @password        = nil # Auth Password
  @digest          = nil # DigestAuth Digest
  @auth_hash       = {}  # Keep track of urls for sending auth
  @request_headers = {}  # A hash of request headers to be used

  # Proxy settings
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true

  @follow_meta_refresh = false
  @redirection_limit   = 20

  # Connection Cache & Keep alive
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # Looking up a scheme that has no handler stores (and returns) a handler
  # that raises UnsupportedSchemeError for that scheme.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise UnsupportedSchemeError.new(scheme)
    }
  end

  # These four schemes share one handler that returns the link untouched.
  pass_through = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end

  @pre_connect_hook  = Chain::PreConnectHook.new
  @post_connect_hook = Chain::PostConnectHook.new

  @html_parser = self.class.html_parser

  yield self if block_given?
end
Sets the user and password to be used for authentication.
# File lib/mechanize.rb, line 213
# Stores the credentials to use for authentication in @user and @password.
# Returns +password+ (the value of the last assignment).
def auth(user, password)
  @user     = user
  @password = password
end
Clicks the Mechanize::Link object passed in and returns the page fetched.
# File lib/mechanize.rb, line 311
# Follows +link+ and returns the page that was fetched.
#
# The target is +link.href+ when the link responds to +href+; otherwise it
# is looked up as <tt>link['href']</tt>, falling back to <tt>link['src']</tt>.
# The referer is +link.page+ when the link responds to +page+ and returns a
# non-nil value; in every other case the agent's current page is used.
def click(link)
  # Only ask for the owning page when the link can supply one. The previous
  # rescue modifier also hid every other StandardError raised by link.page.
  referer = link.respond_to?(:page) ? link.page : nil
  href = link.respond_to?(:href) ? link.href : (link['href'] || link['src'])
  get(:url => href, :referer => (referer || current_page))
end
DELETE to url with query_params, and setting options:
delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/mechanize.rb, line 280
# Issues a DELETE request by delegating to #head with :verb forced to
# :delete, records the resulting page in the history and returns it.
def delete(url, query_params = {}, options = {})
  delete_options = options.merge(:verb => :delete)
  response_page = head(url, query_params, delete_options)
  add_to_history(response_page)
  response_page
end
Fetches the URL passed in and returns a page.
# File lib/mechanize.rb, line 220
# Fetches a URL and returns the resulting page. The page is added to the
# history and yielded to the block when one is given.
#
# +options+ is either the URL itself or a Hash with :url (required) and the
# optional keys :params, :referer, :headers and :verb. When +options+ is not
# a Hash, +parameters+ and +referer+ come from the positional arguments, and
# a +parameters+ value that does not respond to +each+ is treated as the
# referer instead.
#
# Raises ArgumentError when a Hash is passed without a :url value.
def get(options, parameters = [], referer = nil)
  verb = :get
  headers = nil

  if options.is_a?(Hash)
    url = options[:url]
    raise ArgumentError.new("url must be specified") unless url
    parameters = options[:params] || []
    referer = options[:referer]
    headers = options[:headers]
    verb = options[:verb] || verb
  else
    url = options
    # FIXME: Remove this in 0.8.0
    unless parameters.respond_to?(:each)
      referer = parameters
      parameters = []
    end
  end

  # No referer supplied: URLs matching /^http/ get an empty page, anything
  # else gets the current page (or an empty page when there is none).
  unless referer
    referer =
      if url.to_s =~ /^http/
        Page.new(nil, {'content-type'=>'text/html'})
      else
        current_page || Page.new(nil, {'content-type'=>'text/html'})
      end
  end

  # FIXME: Huge hack so that using a URI as a referer works. I need to
  # refactor everything to pass around URIs but still support
  # Mechanize::Page#base
  unless referer.is_a?(Mechanize::File)
    referer_uri = referer.is_a?(String) ? URI.parse(referer) : referer
    referer = Page.new(referer_uri, {'content-type' => 'text/html'})
  end

  # fetch the page
  request_options = {
    :uri     => url,
    :referer => referer,
    :headers => headers || {},
    :verb    => verb,
    :params  => parameters
  }
  page = fetch_page(request_options)
  add_to_history(page)
  yield page if block_given?
  page
end
Fetch a file and return the contents of the file.
# File lib/mechanize.rb, line 305
# Fetches +url+ with #get and returns the +body+ of the resulting page.
def get_file(url)
  fetched = get(url)
  fetched.body
end
HEAD to url with query_params, and setting options:
head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/mechanize.rb, line 291
# Issues a HEAD request for +url+ and returns the resulting page, yielding
# it to the block first when one is given.
#
# +query_params+ is passed as :params. +options+ is merged over the
# defaults built here, so it can override :uri, :headers, :params and :verb.
# This method does not itself add the page to the history.
def head(url, query_params = {}, options = {})
  defaults = {
    :uri     => url,
    :headers => {},
    :params  => query_params,
    :verb    => :head
  }

  # fetch the page
  page = fetch_page(defaults.merge(options))
  yield page if block_given?
  page
end
Posts to the given URL with the request entity. The request entity is specified by either a string, or a list of key-value pairs represented by a hash or an array of arrays.
Examples:
agent.post('http://example.com/', "foo" => "bar")
agent.post('http://example.com/', [ ["foo", "bar"] ])
agent.post('http://example.com/', "<message>hello</message>", 'Content-Type' => 'application/xml')
# File lib/mechanize.rb, line 334
# Posts +query+ to +url+ and returns the result of the underlying request.
#
# A String +query+ is sent as the request entity through
# request_with_entity. Any other +query+ is iterated as name/value pairs and
# turned into a form that is submitted through post_form: IO values become
# file uploads (switching the form to multipart/form-data), all other
# values become plain fields. +headers+ is passed along in both cases.
def post(url, query={}, headers={})
  return request_with_entity(:post, url, query, :headers => headers) if query.is_a?(String)

  # Create a fake form: a Hash standing in for the form node, whose
  # search method always returns an empty list.
  form_node = {}
  def form_node.search(*args); []; end
  form_node['method'] = 'POST'
  form_node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(form_node)
  query.each do |name, value|
    unless value.is_a?(IO)
      form.fields << Form::Field.new({'name' => name.to_s}, value)
      next
    end

    form.enctype = 'multipart/form-data'
    upload = Form::FileUpload.new({'name' => name.to_s}, ::File.basename(value.path))
    upload.file_data = value.read
    form.file_uploads << upload
  end

  post_form(url, form, headers)
end
# File lib/mechanize.rb, line 191
# Returns the +hooks+ of the agent's post-connect hook object.
def post_connect_hooks
  hook_chain = @post_connect_hook
  hook_chain.hooks
end
PUT to url with entity, and setting options:
put('http://tenderlovemaking.com/', 'new content', :headers => {'Content-Type' => 'text/plain'})
# File lib/mechanize.rb, line 271
# Sends +entity+ to +url+ with the :put verb by delegating to
# request_with_entity, and returns what that call returns.
def put(url, entity, options = {})
  request_with_entity(:put, url, entity, options)
end
# File lib/mechanize.rb, line 382
# Sends +entity+ as the request body to +url+ using +verb+, adds the
# resulting page to the history and returns it.
#
# +options+ is merged over defaults for :uri, :referer (the current page, or
# an empty page when there is none) and :headers. The headers given in
# options[:headers] are merged over a default Content-Type of
# application/octet-stream and a Content-Length computed from +entity+.
def request_with_entity(verb, url, entity, options={})
  cur_page = current_page || Page.new( nil, {'content-type'=>'text/html'})

  options = {
    :uri => url,
    :referer => cur_page,
    :headers => {},
  }.update(options)

  # Content-Length counts bytes. String#size counts characters on Ruby 1.9+,
  # which under-reports multibyte bodies, so prefer bytesize when the
  # entity provides it.
  entity_length = entity.respond_to?(:bytesize) ? entity.bytesize : entity.size

  headers = {
    'Content-Type' => 'application/octet-stream',
    'Content-Length' => entity_length.to_s,
  }.update(options[:headers])

  options.update({
    :verb => verb,
    :params => [entity],
    :headers => headers,
  })

  page = fetch_page(options)
  add_to_history(page)
  page
end
Sets the proxy address, port, user, and password. addr should be a host name, with no "http://" prefix.
# File lib/mechanize.rb, line 197
# Stores the proxy host, port and optional credentials in @proxy_addr,
# @proxy_port, @proxy_user and @proxy_pass.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr, @proxy_port, @proxy_user, @proxy_pass =
    addr, port, user, pass
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com')
agent.submit(page.forms.first)
With a button:
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/mechanize.rb, line 366
# Submits +form+ and returns the result of the underlying request.
#
# When +button+ is given it is first added to the form's query. A form whose
# upcased method is 'POST' is sent through post_form. A 'GET' form is sent
# through #get with the form's built query as parameters, the form's page as
# referer and the trailing query string stripped from its action.
#
# Raises a RuntimeError for any other form method.
def submit(form, button=nil, headers={})
  form.add_button_to_query(button) if button

  http_method = form.method.upcase
  if http_method == 'POST'
    post_form(form.action, form, headers)
  elsif http_method == 'GET'
    action_without_query = form.action.gsub(/\?[^\?]*$/, '')
    get(
      :url     => action_without_query,
      :params  => form.build_query,
      :headers => headers,
      :referer => form.page
    )
  else
    raise "unsupported method: #{http_method}"
  end
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/mechanize.rb, line 427
# Yields the agent to the block and returns the block's value. @history is
# replaced afterwards with a copy taken before the block ran, even when the
# block raises.
def transact
  saved_history = @history.dup
  begin
    yield self
  ensure
    @history = saved_history
  end
end
Returns whether or not a url has been visited
# File lib/mechanize.rb, line 413
# Returns true when visited_page finds an entry for +url+ (i.e. it returns
# something other than nil), false otherwise.
def visited?(url)
  !visited_page(url).nil?
end
# File lib/mechanize.rb, line 638
# Pushes +page+ onto @history together with the resolved form of its URI,
# then invokes the history_added callback with the page when one is set.
def add_to_history(page)
  resolved_uri = resolve(page.uri)
  @history.push(page, resolved_uri)

  callback = history_added
  callback.call(page) if callback
end
uri is an absolute URI
# File lib/mechanize.rb, line 470
# Builds the request described by +params+, sends it and returns the
# resulting page.
#
# +params+ is merged over the defaults built below (:verb => :get,
# :redirects => 0, :referer => current_page, empty :params and :headers).
# The merged options hash is run through the pre-connect chain, the request
# is sent on options[:connection], and the post-connect chain leaves the
# result in options[:page], options[:res_klass] and options[:response_body].
#
# Once the response has been handled:
# * when follow_meta_refresh is set, a meta refresh on the page or a
#   'refresh' response header leads to a new GET through a recursive call;
# * a response class that is Net::HTTPSuccess or a subclass returns the page;
# * Net::HTTPNotModified returns the visited page for the URI, or the new
#   page when there is none;
# * a redirection returns the page as is unless follow_redirect? is true,
#   otherwise it is followed with GET (HEAD when the verb was :head);
# * Net::HTTPUnauthorized repeats the request with digest or basic auth
#   recorded for the host, provided a user or password is set and the host
#   is not already present in @auth_hash;
# * anything else raises ResponseCodeError.
#
# EOFError, Errno::ECONNRESET and Errno::EPIPE raised while sending are
# retried up to two times before being re-raised. RedirectLimitReachedError
# is raised when following a 'refresh' header or an HTTP redirect would
# exceed redirection_limit.
#
# NOTE(review): mu_lock is not paired with an ensure, so an exception raised
# before mu_unlock appears to leave the connection locked -- confirm.
# NOTE(review): redirection_limit is checked for the 'refresh' header branch
# but not for meta refresh tags found in the page -- confirm this is intended.
470: def fetch_page(params)
471: options = {
472: :request => nil,
473: :response => nil,
474: :connection => nil,
475: :referer => current_page(),
476: :uri => nil,
477: :verb => :get,
478: :agent => self,
479: :redirects => 0,
480: :params => [],
481: :headers => {},
482: }.merge(params)
483:
484: before_connect = Chain.new([
485: Chain::URIResolver.new(@scheme_handlers),
486: Chain::ParameterResolver.new,
487: Chain::RequestResolver.new,
488: Chain::ConnectionResolver.new(
489: @connection_cache,
490: @keep_alive,
491: @proxy_addr,
492: @proxy_port,
493: @proxy_user,
494: @proxy_pass
495: ),
496: Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
497: Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
498: Chain::HeaderResolver.new(
499: @keep_alive,
500: @keep_alive_time,
501: @cookie_jar,
502: @user_agent,
503: @gzip_enabled,
504: @request_headers
505: ),
506: Chain::CustomHeaders.new,
507: @pre_connect_hook,
508: ])
509: before_connect.handle(options)
510:
511: uri = options[:uri]
512: request = options[:request]
513: cur_page = options[:referer]
514: request_data = options[:params]
515: redirects = options[:redirects]
516: http_obj = options[:connection]
517:
518: # Add If-Modified-Since if page is in history
519: if( (page = visited_page(uri)) && page.response['Last-Modified'] )
520: request['If-Modified-Since'] = page.response['Last-Modified']
521: end if(@conditional_requests)
522:
523: http_obj.mu_lock
524: # Specify timeouts if given
525: http_obj.open_timeout = @open_timeout if @open_timeout
526: http_obj.read_timeout = @read_timeout if @read_timeout
527: http_obj.start unless http_obj.started?
528:
529: # Log specified headers for the request
530: log.info("#{ request.class }: #{ request.path }") if log
531: request.each_header do |k, v|
532: log.debug("request-header: #{ k } => #{ v }")
533: end if log
534:
535: # Send the request
536: attempts = 0
537: begin
538: response = http_obj.request(request, *request_data) { |r|
539: connection_chain = Chain.new([
540: Chain::ResponseReader.new(r),
541: Chain::BodyDecodingHandler.new,
542: ])
543: connection_chain.handle(options)
544: }
545: rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
546: log.error("Rescuing EOF error") if log
547: http_obj.finish
548: raise x if attempts >= 2
549: request.body = nil
550: http_obj.start
551: attempts += 1
552: retry
553: end
554:
555: after_connect = Chain.new([
556: @post_connect_hook,
557: Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
558: Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
559: ])
560: after_connect.handle(options)
561: http_obj.mu_unlock
562:
563: res_klass = options[:res_klass]
564: response_body = options[:response_body]
565: page = options[:page]
566:
567: log.info("status: #{ page.code }") if log
568:
569: if follow_meta_refresh
570: redirect_uri = nil
571: referer = page
572: if (page.respond_to?(:meta) && (redirect = page.meta.first))
573: redirect_uri = redirect.uri.to_s
574: sleep redirect.node['delay'].to_f
575: referer = Page.new(nil, {'content-type'=>'text/html'})
576: elsif refresh = response['refresh']
577: delay, redirect_uri = Page::Meta.parse(refresh, uri)
578: raise StandardError, "Invalid refresh http header" unless delay
579: if redirects + 1 > redirection_limit
580: raise RedirectLimitReachedError.new(page, redirects)
581: end
582: sleep delay.to_f
583: end
584: if redirect_uri
585: @history.push(page, page.uri)
586: return fetch_page(
587: :uri => redirect_uri,
588: :referer => referer,
589: :params => [],
590: :verb => :get,
591: :redirects => redirects + 1
592: )
593: end
594: end
595:
596: return page if res_klass <= Net::HTTPSuccess
597:
598: if res_klass == Net::HTTPNotModified
599: log.debug("Got cached page") if log
600: return visited_page(uri) || page
601: elsif res_klass <= Net::HTTPRedirection
602: return page unless follow_redirect?
603: log.info("follow redirect to: #{ response['Location'] }") if log
604: from_uri = page.uri
605: raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
606: redirect_verb = options[:verb] == :head ? :head : :get
607: page = fetch_page( :uri => response['Location'].to_s,
608: :referer => page,
609: :params => [],
610: :verb => redirect_verb,
611: :redirects => redirects + 1
612: )
613: @history.push(page, from_uri)
614: return page
615: elsif res_klass <= Net::HTTPUnauthorized
616: raise ResponseCodeError.new(page) unless @user || @password
617: raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
618: if response['www-authenticate'] =~ /Digest/i
619: @auth_hash[uri.host] = :digest
620: if response['server'] =~ /Microsoft-IIS/
621: @auth_hash[uri.host] = :iis_digest
622: end
623: @digest = response['www-authenticate']
624: else
625: @auth_hash[uri.host] = :basic
626: end
627: return fetch_page( :uri => uri,
628: :referer => cur_page,
629: :verb => request.method.downcase.to_sym,
630: :params => request_data,
631: :headers => options[:headers]
632: )
633: end
634:
635: raise ResponseCodeError.new(page), "Unhandled response", caller
636: end
# File lib/mechanize.rb, line 448
# Posts +form+ to +url+, adds the resulting page to the history and returns
# it.
#
# The referer is the form's page, falling back to the current page and then
# to an empty page. The body is +form.request_data+; it is sent with a
# Content-Type taken from +form.enctype+ and a Content-Length computed from
# the body. Entries in +headers+ override those two defaults.
def post_form(url, form, headers = {})
  cur_page = form.page || current_page ||
    Page.new( nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  log.debug("query: #{ request_data.inspect }") if log

  # Content-Length counts bytes. String#size counts characters on Ruby 1.9+,
  # which under-reports multibyte bodies, so prefer bytesize when the
  # request data provides it.
  body_length = request_data.respond_to?(:bytesize) ? request_data.bytesize : request_data.size

  # fetch the page
  page = fetch_page( :uri => url,
                     :referer => cur_page,
                     :verb => :post,
                     :params => [request_data],
                     :headers => {
                       'Content-Type' => form.enctype,
                       'Content-Length' => body_length.to_s,
                     }.merge(headers))
  add_to_history(page)
  page
end