I tires to build crawler that separate websites with problems as (redirected, not fond, website is down ...) from list. I added reachtextbox to store url list and another rtb to display html code for url-s. I need to rewrite some informations that belong to problem url-s and skip it in list to crawl other url-s. Some websites throws Exception as belove and stops crawelling process:
System.Net.Http.HttpRequestException: An error accurred while sending the request. ---> System.Net.WebException: The remote name could not be resolved: 'www.thexlr.com' at HttpWebRequest.EndGetResponse(IAsyncResult asyncResult) at HttpClientHandler.GetResponseCallback(IAsyncResult ar) ...
I do not know how to deal with problem websites. Do someone have any idea?
My code:
HttpClient clientKontrKat; HttpResponseMessage responseKontrKat; #region start button public async void buttonSpustitKontroluStranok_Click(object sender, EventArgs e) { richTextBoxKontrolaUrlWebObsah.Text = ""; richTextBoxKontrolaUrlVnutorneLinky.Text = ""; StringBuilder sb = new StringBuilder(); if (dataGridView1.Rows.Count > -1) { if (categoryComboBox3.Items.Count > -1 && categoryComboBox3.SelectedItem != null && statusComboBox1.Items.Count > -1 && statusComboBox1.SelectedItem != null) { string categItemCoPrehliadnut = categoryComboBox3.SelectedValue.ToString(); string statusItemCoOznacit = statusComboBox1.SelectedValue.ToString(); foreach (DataGridViewRow item in dataGridView1.Rows) { string ColumnCategItemsCoPrehliadnut = item.Cells[2].Value.ToString(); if (categItemCoPrehliadnut == ColumnCategItemsCoPrehliadnut) { string urlCoPrehliadnut = item.Cells[0].Value.ToString(); sb.Append(urlCoPrehliadnut + Environment.NewLine); } } } } richTextBoxKontrolaUrlVnutorneLinky.Text = sb.ToString(); richTextBoxKontrolaUrlVnutorneLinky.Text = string.Join(Environment.NewLine, richTextBoxKontrolaUrlVnutorneLinky.Lines.Distinct()); #region Crawler int i = 0; while (i < richTextBoxKontrolaUrlVnutorneLinky.Lines.Length - 1) { try { buttonSpustitKontroluStranok.Enabled = false; string lineUrl = richTextBoxKontrolaUrlVnutorneLinky.Lines[i].ToString(); HttpClientHandler handler = new HttpClientHandler(); handler.AllowAutoRedirect = false; clientKontrKat = new HttpClient(handler); clientKontrKat.MaxResponseContentBufferSize = 256000; clientKontrKat.DefaultRequestHeaders.Add("user-agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)"); responseKontrKat = await clientKontrKat.GetAsync(lineUrl); //responseKontrKat.EnsureSuccessStatusCode(); string responseBody = await responseKontrKat.Content.ReadAsStringAsync(); // Above three lines can be replaced with new helper method in following line // string body = await client.GetStringAsync(uri); labelKontrolaWebRespPopis.Text = "Web response popis: " + responseKontrKat.StatusCode; if (responseKontrKat.IsSuccessStatusCode) { richTextBoxKontrolaUrlWebObsah.Text = ""; richTextBoxKontrolaUrlWebObsah.Text = responseBody; } else { // problems handling here //if (richTextBoxKontrolaUrlVnutorneLinky.Lines.Contains(lineUrl)) //{ //MessageBox.Show("stop, " + lineUrl + " i: " + i); richTextBoxUrlsToCrawl.SelectionStart = i; richTextBoxUrlsToCrawl.SelectionLength = richTextBoxUrlsToCrawl.Text.IndexOf("\n", 0) + 1; richTextBoxUrlsToCrawl.SelectedText = ""; //} } i++; int pocetLiniekNaPrehliadnutie = richTextBoxKontrolaUrlVnutorneLinky.Lines.Count() - 1; labelPocetStranok.Text = "Pocet prehliadnutych stranok:" + i + " z " + pocetLiniekNaPrehliadnutie; // // MessageBox.Show("stop, " + lineUrl); // } //} } catch (WebException ex) { //if (ex.Status == WebExceptionStatus.ProtocolError) //{ // if (((HttpWebResponse)ex.Response).StatusCode == HttpStatusCode.NotFound) // { // // handle the 404 here // } //} //else if (ex.Status == WebExceptionStatus.NameResolutionFailure) { // handle name resolution failure MessageBox.Show("stop, "); } } catch (Exception ExKontStranok) { //if (responseKontrKat.IsSuccessStatusCode) //{ // return; //} //if (responseKontrKat.StatusCode == HttpStatusCode.NotFound) //if (ExKontStranok.GetBaseException().InnerException(WebException )==WebExceptionStatus .NameResolutionFailure) //{ MessageBox.Show("stop, "); richTextBoxUrlsToCrawl.SelectionStart = i; richTextBoxUrlsToCrawl.SelectionLength = richTextBoxUrlsToCrawl.Text.IndexOf("\n", 0) + 1; richTextBoxUrlsToCrawl.SelectedText = ""; //} MessageBox.Show("chyba:\n" + ExKontStranok.ToString()); } finally { buttonSpustitKontroluStranok.Enabled = true; } } #endregion MessageBox.Show("Urobene!"); } #endregion #region stop button private void buttonZastavitKontroluStranok_Click(object sender, EventArgs e) { clientKontrKat.CancelPendingRequests(); responseKontrKat.Dispose(); } #endregion
Thank you for help if any.